diff --git a/CMakeLists.txt b/CMakeLists.txt index dc2dee11add6c625f16cfae9e9b1c7b20533fb9d..941ee965b25102ae7a813da487fa451677f6f4a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + endif() + + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + add_compile_options(/MP) + message(STATUS "Using parallel compiling (/MP)") + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + +endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) find_package(CUDA QUIET) endif() @@ -59,8 +84,10 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) +lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) @@ -68,7 +95,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) -lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE) +lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." 
OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) @@ -104,9 +131,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) + if(WIN32) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) + else() + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE) + endif() endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") @@ -128,6 +162,10 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -184,6 +222,7 @@ endif() include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package + include(external/libxsmm) # download, build, install libxsmm include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -208,7 +247,9 @@ include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION -include(flags) +if(NOT APPLE) + include(flags) +endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/build.bat b/build.bat new file mode 100644 index 0000000000000000000000000000000000000000..4510ee774ed9a3b9fe5a9d55b405b1dae39c3f45 --- /dev/null +++ b/build.bat @@ -0,0 +1,134 @@ +@echo off +setlocal +setlocal enabledelayedexpansion + +set source_path=%~dp0 +rem global variables +set BUILD_EXTRA=OFF +set BUILD_JAVA=ON +set BUILD_PYTHON=OFF +set BUILD_DIR=%source_path% +set OPTMODEL_DIR="" +set BUILD_TAILOR=OFF +set BUILD_CV=OFF +set SHUTDOWN_LOG=ON + +set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz + +set workspace=%source_path% + +:set_vcvarsall_dir +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +set tmp_var=!vcvarsall_dir! +call:remove_space +set vcvarsall_dir=!tmp_var! +IF NOT EXIST "%vcvarsall_dir%" ( + echo "------------%vcvarsall_dir% not exist------------" + goto set_vcvarsall_dir +) + +call:prepare_thirdparty + +if EXIST "%build_directory%" ( + call:rm_rebuild_dir "%build_directory%" + md "%build_directory%" +) + +set root_dir=%workspace% +set build_directory=%BUILD_DIR%\build.lite.x86 +set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code +set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug + +rem for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +rem here we fake an empty file to make cmake works. +if NOT EXIST "%GEN_CODE_PATH_PREFIX%" ( + md "%GEN_CODE_PATH_PREFIX%" +) + +type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc" + +if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" ( + md "%DEBUG_TOOL_PATH_PREFIX%" +) + +copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\" + +cd "%build_directory%" + + cmake .. 
-G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^ + -DWITH_MKLDNN=OFF ^ + -DLITE_WITH_X86=ON ^ + -DLITE_WITH_PROFILE=OFF ^ + -DWITH_LITE=ON ^ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^ + -DLITE_WITH_ARM=OFF ^ + -DWITH_GPU=OFF ^ + -DLITE_BUILD_EXTRA=ON ^ + -DLITE_WITH_PYTHON=ON ^ + -DPYTHON_EXECUTABLE="%python_path%" + +call "%vcvarsall_dir%" amd64 + +msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1 +goto:eof + +:prepare_thirdparty + SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>" + set tmp_var=!python_path! + call:remove_space + set python_path=!tmp_var! + if "!python_path!"=="" ( + set python_path=python.exe + ) else ( + if NOT exist "!python_path!" ( + echo "------------!python_path! not exist------------" + goto:eof + ) + ) + + if EXIST "%workspace%\third-party" ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists." + ) else ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz exists." + call:rm_rebuild_dir "%workspace%\third-party" + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + ) else ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists." + call:download_third_party + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) else ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists." + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + + ) + git submodule update --init --recursive +goto:eof + +:download_third_party +powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^ +'%workspace%third-party-05b862.tar.gz') +goto:eof + +:rm_rebuild_dir + del /f /s /q "%~1\*.*" >nul 2>&1 + rd /s /q "%~1" >nul 2>&1 +goto:eof + + +:remove_space +:remove_left_space +if "%tmp_var:~0,1%"==" " ( + set "tmp_var=%tmp_var:~1%" + goto remove_left_space +) + +:remove_right_space +if "%tmp_var:~-1%"==" " ( + set "tmp_var=%tmp_var:~0,-1%" + goto remove_left_space +) +goto:eof \ No newline at end of file diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 80c59f19cc4a587b1c33ad796740a4d148a7ec46..1c5b58def0d9383dabf3b5d7f814e96617f8f3b8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -34,6 +34,15 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + + if (NOT MSVC) + message(FATAL "Windows build only support msvc. 
Which was binded by the nvcc compiler of NVIDIA.") + endif(NOT MSVC) +endif(WIN32) + if(LITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) @@ -70,7 +79,7 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) + if(WIN32 OR APPLE) # openmp not support well for now on windows set(OPENMP_FLAGS "") else(WIN32) @@ -134,8 +143,15 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") + if (LITE_WITH_XTCL) + add_definitions("-DLITE_WITH_XTCL") + endif() endif() if (LITE_WITH_OPENCL) @@ -156,9 +172,10 @@ endif() if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() +endif() + +if (LITE_WITH_PRECISION_PROFILE) + add_definitions("-DLITE_WITH_PRECISION_PROFILE") endif() if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -177,3 +194,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") endif(LITE_ON_MODEL_OPTIMIZE_TOOL) +if (LITE_WITH_PYTHON) + add_definitions("-DLITE_WITH_PYTHON") +endif(LITE_WITH_PYTHON) diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +if(NOT DEFINED RKNPU_DDK_ROOT) + set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT}) + if(NOT RKNPU_DDK_ROOT) + message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON") + endif() +endif() + +message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}") +find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h + PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH) +if(NOT RKNPU_DDK_INC) + message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include") +endif() + +include_directories("${RKNPU_DDK_ROOT}/include") + +set(RKNPU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(RKNPU_SUB_LIB_PATH "lib64") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(RKNPU_SUB_LIB_PATH "lib") +endif() + +find_library(RKNPU_DDK_FILE NAMES rknpu_ddk + PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}) + +if(NOT RKNPU_DDK_FILE) + message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}") +else() + message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}") + add_library(rknpu_ddk SHARED IMPORTED GLOBAL) + set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE}) +endif() + +set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs") diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 099833ee4cf80968671036cffe89329506bbf091..823048552f3cb5f05375e97e94cd5b5ad63e7563 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT) message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") endif() endif() - message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") -find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl - NO_DEFAULT_PATH) -if(NOT XPU_SDK_INC) - message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") -endif() -include_directories("${XPU_SDK_ROOT}/XTCL/include") include_directories("${XPU_SDK_ROOT}/XTDK/include") -find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_XTCL_FILE) - message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") - add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) -endif() - -find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_TVM_FILE) - message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") - add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) -endif() - find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi PATHS ${XPU_SDK_ROOT}/XTDK/shlib NO_DEFAULT_PATH) @@ -82,23 +50,55 @@ else() set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) endif() -find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_LLVM_FILE) - message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") - add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION 
${XPU_SDK_LLVM_FILE}) +set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs") + +if(LITE_WITH_XTCL) + find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") + endif() + include_directories("${XPU_SDK_ROOT}/XTCL/include") + + find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) + endif() + + find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) + endif() + + find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) + endif() + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1") + + set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") + set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0") - -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 599e7bba7eaf12da7506ce44e706bd9f50ec6998..5a757659bb036ca99326bc40cc075f761ba6e641 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -36,7 +36,16 @@ else() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen GIT_TAG - URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen before v2.3.0 + # URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen since v2.6.0 + # github address: https://github.com/eigenteam/eigen-git-mirror + # we changed the source code to adapt for windows compiling + # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h + 
###################################################################################################### + URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 142fce816de4f06aa0a36b91e3e4ecb962a8dc2a..8d094d6e064fe57b170d1a50a5457c104d3c3ac2 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -38,7 +32,17 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSEIF(APPLE) + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ae99f4df9a3676ae8f5b2c4c01305ead9b7a8254..57e332f1c103b28a194670de609ee521aa41cdf3 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) + #find_python_module(numpy REQUIRED) #find_python_module(wheel REQUIRED) #find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) + #FIND_PACKAGE(NumPy REQUIRED) #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # "please use pip to upgrade protobuf. 
pip install -U protobuf") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 225a3c19a16435c4df6403ff7d1bdd01e628dd72..d859404d559282970d96a735c400f745481e8efa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + elseif(NOT APPLE) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d16e7af3d7a61fff0ef13cf7cfcbd7af542e7c3f..9a633409cd4d1c5e650a4794fcf30b9154c8638a 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -88,6 +88,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_RKNPU) + foreach(var ${lite_deps_RKNPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + if (LITE_WITH_XPU) foreach(var ${lite_deps_XPU_DEPS}) set(deps ${deps} ${var}) @@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -142,6 +148,7 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} @@ -161,8 +168,10 @@ function(lite_cc_library TARGET) else() cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() # collect targets need to compile for lite if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) add_dependencies(lite_compile_deps ${TARGET}) @@ -177,7 +186,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -191,7 +200,8 @@ function(lite_cc_binary TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} 
LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -199,7 +209,9 @@ function(lite_cc_binary TARGET) MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() if (NOT APPLE) # strip binary target to reduce size if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -226,7 +238,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -248,7 +260,8 @@ function(lite_cc_test TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -263,7 +276,9 @@ function(lite_cc_test TARGET) "${TARGET}" COMMENT "Strip debug symbols done on final executable file.") endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() file(APPEND ${offline_test_registry_file} "${TARGET}\n") # collect targets need to compile for lite @@ -280,6 +295,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -295,12 +311,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -317,9 +333,18 @@ function(add_kernel TARGET device level) if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") @@ -332,16 +357,11 @@ function(add_kernel TARGET device level) set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) + if (NOT LITE_WITH_X86 OR 
LITE_ON_MODEL_OPTIMIZE_TOOL) foreach(src ${args_SRCS}) file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") endforeach() return() - elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() @@ -381,8 +401,20 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "MLU") if (NOT LITE_WITH_MLU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") @@ -426,7 +458,8 @@ function(add_kernel TARGET device level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -451,11 +484,13 @@ function(add_operator TARGET level) ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() foreach(src ${args_SRCS}) if(LITE_BUILD_TAILOR) @@ -478,7 +513,8 @@ function(add_operator TARGET level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -486,6 +522,29 @@ function(add_operator TARGET level) ) endfunction() +#only for windows +function(create_static_lib TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 60) + list(LENGTH libs libs_len) + + foreach(lib ${libs}) + list(APPEND dummy_list ${lib}) + list(LENGTH dummy_list listlen) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) + set(dummy_list) + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) +endfunction() # Bundle several static libraries into one. 
function(bundle_static_library tgt_name bundled_tgt_name fake_target) @@ -529,7 +588,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) set(bundled_tgt_full_name ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) - #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") + message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") + + if(WIN32) + set(dummy_tgt_name dummy_${bundled_tgt_name}) + create_static_lib(${bundled_tgt_name} ${static_libs}) + add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name}) + add_dependencies(${fake_target} ${tgt_name}) + + add_library(${dummy_tgt_name} STATIC IMPORTED) + set_target_properties(${dummy_tgt_name} + PROPERTIES + IMPORTED_LOCATION ${bundled_tgt_full_name} + INTERFACE_INCLUDE_DIRECTORIES $) + add_dependencies(${dummy_tgt_name} ${fake_target}) + return() + endif() if(NOT IOS) file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 12dd17c5a302259fb8f903735115106526716194..98c01ae92523593b075ac2335f620a63f52260fd 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") @@ -75,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -82,16 +87,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") # add python lib if (LITE_WITH_PYTHON) - add_custom_target(publish_inference_python_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + if(WIN32) + set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd") + set(LITE_CORE_DEPS ${LITE_CORE}) + add_custom_command(OUTPUT ${LITE_CORE} + COMMAND cmake -E copy $ ${LITE_CORE} + DEPENDS lite_pybind) + add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS}) + + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory 
"${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd" + DEPENDS copy_lite_pybind + ) + + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) + add_custom_target(publish_inference_python_light_demo ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + ) + add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) + add_dependencies(publish_inference publish_inference_python_light_demo) + else() + if(APPLE) + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + else() + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + endif() add_custom_target(publish_inference_python_installer ${TARGET} - COMMAND python setup.py bdist_wheel + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} @@ -107,10 +155,27 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference 
publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) + endif(WIN32) endif() -if (LITE_WITH_X86) - add_custom_target(publish_inference_x86_cxx_lib ${TARGET} +if (LITE_WITH_CUDA OR LITE_WITH_X86) + if(APPLE) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + elseif(NOT WIN32) + add_custom_target(publish_inference_cxx_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" @@ -118,50 +183,76 @@ if (LITE_WITH_X86) COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + endif() +endif() + +if (LITE_WITH_X86) + if(WIN32) + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND 
${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + + add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) + add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + + add_custom_target(publish_inference_x86_cxx_demos ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) + add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3) + + else() + + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" ) - add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) - add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) - add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + add_dependencies(publish_inference publish_inference_x86_cxx_demos) + endif() endif() if(LITE_WITH_CUDA) - add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cuda_cxx_lib) - add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} - COMMAND mkdir -p 
"${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) - add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) -endif(LITE_WITH_CUDA) + add_dependencies(publish_inference publish_inference_cuda_cxx_demos) +endif(LITE_WITH_CUDA) + if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -193,7 +284,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so) endif() endif() else() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 2a93331f4ac179cc35acb65bd9271c68a93d71ad..763b988653b60ec02b54200b232be6b79f41d357 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -8,39 +8,48 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() -set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) + +set(light_lib_DEPS light_api paddle_api paddle_api_light) + +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library - add_library(paddle_full_api_shared SHARED "") - target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) + lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc + DEPS paddle_api paddle_api_light paddle_api_full) add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) - if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) add_dependencies(paddle_full_api_shared dynload_mklml) endif() + if(WIN32) + target_link_libraries(paddle_full_api_shared shlwapi.lib) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") endif(LITE_WITH_CUDA) #light api dynamic library - lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels}) - - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") - add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) 
- add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) - set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) - add_dependencies(paddle_full_api_shared custom_linker_map) + lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} + ) + + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels}) + if(NOT APPLE AND NOT WIN32) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") @@ -55,6 +64,11 @@ else() # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -65,6 +79,7 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels}) endif() @@ -78,6 +93,12 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -86,6 +107,7 @@ message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") @@ -103,6 +125,7 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) @@ -124,6 +147,7 @@ lite_cc_library(light_api SRCS light_api.cc CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -143,6 +167,7 @@ if(WITH_TESTING) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -188,7 +213,11 @@ if(WITH_TESTING) lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/classify) + 
lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/yolov3) endif() endif() endif() @@ -240,6 +269,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -266,7 +296,8 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -282,6 +313,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -291,6 +323,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model @@ -316,7 +349,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) + DEPS gflags kernel op optimizer mir_passes utils ${host_kernels}) add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) @@ -326,6 +359,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -347,6 +381,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -360,6 +395,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -373,6 +409,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -383,6 +420,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -395,17 +433,20 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} 
CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h index 6da47e53789d651f4a36d0b8d6a7ca1ea5a0a3d3..778b4dc7a8d19bc07d641e2923234d84c59099c5 100644 --- a/lite/api/_paddle_use_ops.h +++ b/lite/api/_paddle_use_ops.h @@ -63,6 +63,7 @@ USE_LITE_OP(swish) USE_LITE_OP(log) USE_LITE_OP(exp) USE_LITE_OP(conv2d_transpose) +USE_LITE_OP(depthwise_conv2d_transpose) USE_LITE_OP(negative) USE_LITE_OP(pad2d) USE_LITE_OP(power) diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index d53de7bf2ed00fed70bbd1f70729a051e5d7203b..0ce7f6f0d5aa5bb5c7bc66dbeddaa618fa6466e6 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -13,7 +13,13 @@ // limitations under the License. #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif +#define GLOG_NO_ABBREVIATED_SEVERITIES // glog's abbreviated severities conflict with windows.h under MSVC #include #include #include @@ -27,6 +33,9 @@ #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", "the path of the model, the model and param files is under " @@ -44,7 +53,10 @@ DEFINE_string(input_shape, "set input shapes according to the model, " "separated by colon and comma, " "such as 1,3,244,244"); -DEFINE_string(input_img_path, "", "the path of input image"); +DEFINE_string(input_img_path, + "", + "the path of the input image; if not set, " + "the model input will be filled with 1.0."); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -57,16 +69,8 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", - "save benchmark " - "result to the file"); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. "); -DEFINE_bool(is_quantized_model, - false, - "if set true, " - "test the performance of the quantized model. 
"); + "save the inference time to the file."); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -87,10 +91,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; - if (FLAGS_is_quantized_model) { - vaild_places.insert(vaild_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } config.set_valid_places(vaild_places); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -106,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK void Run(const std::vector& input_shape, - const std::string& model_dir, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -122,10 +130,7 @@ void Run(const std::vector& input_shape, auto input_tensor = predictor->GetInput(0); input_tensor->Resize(input_shape); auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shape.size(); ++i) { - input_num *= input_shape[i]; - } + int64_t input_num = ShapeProduction(input_shape); if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; @@ -173,26 +178,78 @@ void Run(const std::vector& input_shape, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + LOG(INFO) << "output data[0:10]:"; + for (int i = 0; i < 10; i++) { + LOG(INFO) << out_data[i]; + } + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (The path of the model that is optimized\n" + " by opt. If the model is optimized, please set the param.) \n" + " type: string \n" + " --model_dir (The path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (The filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (The filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (Set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (The path of input image, if not set\n" + " input_img_path, the input will be 1.0.) 
type: string \n " + " --power_mode (Arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (Repeats times) type: int32 default: 1 \n" + " --result_filename (Save the inference time to the file.) type: \n" + " string default: result.txt \n" + " --threads (Threads num) type: int32 default: 1 \n" + " --warmup (Warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path. Otherwise, \n" + " set model_dir, model_filename and param_filename according to \n" + " the model. \n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - if (FLAGS_model_dir.back() == '/') { - FLAGS_model_dir.pop_back(); - } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -208,19 +265,31 @@ int main(int argc, char** argv) { } return shape; }; - std::vector input_shape = get_shape(FLAGS_input_shape); - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? 
save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shape, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 556a9e0af01854ff5c57a14dade72b81ed255964..f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -19,6 +19,7 @@ #include #include #include +#include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { @@ -291,10 +292,13 @@ void Predictor::Build(const cpp::ProgramDesc &desc, program_desc_ = desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; - inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); - inner_places.emplace_back( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + for (auto &valid_place : valid_places) { + inner_places.emplace_back( + Place(TARGET(kHost), valid_place.precision, valid_place.layout)); + } + + // Analyze whether the model is quantized. + // For a quantized model, add place(arm, int8) to inner_places const std::vector quant_dequant_op = { "fake_quantize_abs_max", "fake_quantize_range_abs_max", @@ -317,7 +321,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc, } } if (is_quantized_model) { - inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + inner_places.insert(inner_places.begin(), + Place{TARGET(kARM), PRECISION(kInt8)}); } Program program(desc, scope_, inner_places); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index e63893cb91e112beb6be50bd661a57b9738e5fb1..146556756af7e0b56ae38b5303e622c97dfe58af 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -43,6 +43,7 @@ class LITE_API Predictor { public: // Create an empty predictor. Predictor() { scope_ = std::make_shared(); } + // Create a predictor with the weight variable scope set. explicit Predictor(const std::shared_ptr& root_scope) : scope_(root_scope) {} diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index d4da1c429b5f66085b659047636383ecd546d937..fc1a0648c0fd4e50621cfaf75495da6df6ccd86e 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,19 +20,35 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) #include #include "lite/backends/x86/mklml.h" #endif - namespace paddle { namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; + auto places = config.valid_places(); + std::vector passes{}; #ifdef LITE_WITH_CUDA - Env::Init(); + // if kCUDA is included in valid places, it should be initialized first, + // otherwise skip this step.
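+ // Example (illustrative): with valid_places = {Place{TARGET(kCUDA), PRECISION(kFloat)}, Place{TARGET(kHost), PRECISION(kFloat)}}, the loop below finds the kCUDA entry, initializes the CUDA Env once, and registers the multi_stream_analysis_pass when multi_stream is enabled in the config.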
+ for (auto &p : places) { + if (p.target == TARGET(kCUDA)) { + Env::Init(); + if (config_.multi_stream()) { + passes = {"multi_stream_analysis_pass"}; + VLOG(3) << "add pass: " << passes[0]; + } + break; + } + } #endif #ifdef LITE_WITH_MLU Env::Init(); @@ -43,8 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config.mlu_first_conv_std(), config.mlu_input_layout()); #endif // LITE_WITH_MLU - auto places = config.valid_places(); - std::vector passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; @@ -56,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); - #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index b641973a15b2e6abc1cf4c999d759271f7522638..7a7f870a9ac38e4103f3f8a7c6b95a98bb6722db 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,13 +13,10 @@ // limitations under the License. #include "lite/api/light_api.h" +#include +#include #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif - -#include namespace paddle { namespace lite { @@ -32,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For post-training weight quantization, load the int8/16 weights + // of the optimized model and dequantize them to fp32. DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -139,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 1. Create op first Program program(prog, scope_, {}); - // 2. Create Instructs +// 2. Create Instructs +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif // Create the kernels of the target places, and filter out the specific // kernel with the target alias.
@@ -155,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { return it->alias() == alias; }); CHECK(it != kernels.end()); + +#ifdef LITE_WITH_OPENCL + if ((*it)->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + (*it)->SetContext(std::move(ctx)); + } else { + (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); + } +#else (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +#endif insts.emplace_back(op, std::move(*it)); } @@ -166,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : 
input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc deleted file mode 100644 index cfe3d9de09a646e33c4a116bb3cd087d28aa24c2..0000000000000000000000000000000000000000 --- a/lite/api/light_api_shared.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/paddle_api.h" - -namespace paddle { -namespace lite_api { - -void RunModel() { - // 1. Set MobileConfig - MobileConfig mobile_config; - - // 2. 
Create PaddlePredictor by MobileConfig - std::shared_ptr mobile_predictor = - CreatePaddlePredictor(mobile_config); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 51f9b565196d30520f0cf73ea41a01fed0cc49e8..efad7b74e943c29c9af1af5c14ac51621eefe576 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -23,6 +23,7 @@ #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" @@ -108,6 +109,10 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); } else if (target_repr == "mlu") { valid_places.emplace_back(TARGET(kMLU)); } else { @@ -186,6 +191,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -250,16 +256,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 91edb2cda7849211f288d64e00191ddba8f82f19..daef2c66dda5188a1eec25c3d5f045f1fa705e1e 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/api/paddle_api.h" +#include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" @@ -203,6 +204,7 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { mlu_core_version_ = core_version; } @@ -227,12 +229,32 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -std::vector CxxConfig::mlu_first_conv_mean() const { +const std::vector &CxxConfig::mlu_first_conv_mean() const { return mlu_first_conv_mean_; } -std::vector CxxConfig::mlu_first_conv_std() const { +const std::vector &CxxConfig::mlu_first_conv_std() const { return mlu_first_conv_std_; } +#endif + +void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { +#ifdef LITE_WITH_XPU + lite::Context::SetWorkspaceL3Size(l3_size); +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_workspace_l3_size_per_thread' is ignored, please " + "rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_dev_per_thread(int dev_no) { +#ifdef LITE_WITH_XPU + lite::Context::SetDev(dev_no); +#else + LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} // set model data in combined format, `set_model_from_file` refers to loading // model from file, set_model_from_buffer refers to loading model from memory diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 0cb60bf84fe5063287646f825dc74dc5f51bee11..79ab98da799a99540217d55e3d40b46800f17626 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,12 +136,17 @@ class LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_CUDA + bool multi_stream_{false}; +#endif +#ifdef LITE_WITH_MLU lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; bool mlu_use_first_conv_{false}; std::vector mlu_first_conv_mean_; std::vector mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -169,20 +174,41 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif +#ifdef LITE_WITH_CUDA + void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } + int multi_stream() const { return multi_stream_; } +#endif +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW void set_mlu_input_layout(DataLayoutType layout); + // whether use MLU's first conv kernel. 
First conv is a special kernel + // provided by MLU; its input is uint8, and it also needs two 3-dimensional + // vectors which hold all inputs' mean and std values void set_mlu_use_first_conv(bool use_first_conv); + // set the 3-dimensional mean vector used by MLU's first conv void set_mlu_first_conv_mean(const std::vector& mean); + // set the 3-dimensional std vector used by MLU's first conv void set_mlu_first_conv_std(const std::vector& std); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; bool mlu_use_first_conv() const; - std::vector mlu_first_conv_mean() const; - std::vector mlu_first_conv_std() const; + const std::vector& mlu_first_conv_mean() const; + const std::vector& mlu_first_conv_std() const; +#endif + + // XPU only, set the size of the workspace memory from L3 cache for the + // current thread. + void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); + // XPU only, specify the target device ID for the current thread. + void set_xpu_dev_per_thread(int dev_no = 0); }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index e99127e233bc4adf159a6a567dfb15f6fd784a27..9dc5c9e857243ecb57f785737b00929e36c5d83c 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -18,20 +18,21 @@ */ #pragma once -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); +// some platform-independent definitions +#include "lite/utils/macros.h" + +#define USE_LITE_OP(op_type__) \ + extern int touch_op_##op_type__(); \ + int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); + UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ +#define USE_MIR_PASS(name__) \ + extern bool mir_pass_registry##name__##_fake(); \ + static bool mir_pass_usage##name__ UNUSED = \ mir_pass_registry##name__##_fake(); #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index aceb047b64f54ac18ac492ef495d32c3180ad4b4..efd22fc22a4180c3cac9f269fc14f6541c16b885 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "mlu"}; + "mlu", + "rknpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) { "kNPU", "kXPU", "kMLU", - "kBM"}; + "kBM", + "kRKNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index b2d1e6a08954c7f00ae24cfb6be43dac3b168228..2b271a4872e7e14c48632a2bb1aae56d53145cba 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -54,8 +54,9 @@ enum class TargetType :
int { kXPU = 9, kBM = 10, kMLU = 11, + kRKNPU = 12, kAny = 6, // any target - NUM = 12, // number of fields. + NUM = 13, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -101,7 +102,10 @@ enum class ActivationType : int { kTanh = 6, kSwish = 7, kExp = 8, - NUM = 9, + kAbs = 9, + kHardSwish = 10, + kReciprocal = 11, + NUM = 12, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 8f80f0014f7f6213a010035f581ad4dcb715aba1..1eb5af74d29f72fa90712d04c922958755d79265 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -42,11 +42,13 @@ USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); +USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); -USE_MIR_PASS(subgraph_cast_display_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8..5dfecf8c619d8cf9be7a03fa46b4e86a6e641a29 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -17,8 +17,12 @@ execute_process( OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE ) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - +if(APPLE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +endif() add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..72a75d9caaa79fa96e52e8603ae6886aac341009 100644 --- a/lite/api/python/__init__.py +++ b/lite/api/python/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
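+# Note: on Windows the bundled third-party DLLs (e.g. mklml.dll and libiomp5md.dll when MKL is enabled) are packaged under paddlelite/libs; the code below prepends that directory to PATH and sys.path so the lite extension can locate them at import time.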
+ +import os +import sys + +if os.name =='nt': + current_path = os.path.abspath(os.path.dirname(__file__)) + third_lib_path = current_path + os.sep + 'libs' + os.environ['path'] = third_lib_path+ ';' + os.environ['path'] + sys.path.insert(0, third_lib_path) diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index b1de18d50c1582b0f872ad38d24939665ab1d3b0..fe4cdb5a73d62afa98fb8c343e8a6a20388e293b 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() -lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +if(WIN32) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(lite_pybind ${os_dependency_modules}) +else() + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +endif(WIN32) + if (LITE_ON_TINY_PUBLISH) set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") endif() diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index 79028fb7493bf55eab74aa76ee51ac79f418ba0a..b04a6077f5aafecf76fed0b0dee5c56919b9302e 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -34,20 +34,27 @@ else: # core lib of paddlelite is stored as lite.so LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' -PACKAGE_DATA = {'paddlelite': ['lite.so']} +PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']} # put all thirdparty libraries in paddlelite.libs PACKAGE_DATA['paddlelite.libs'] = [] LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) - PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] - + if os.name != 'nt': + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + else: + PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll'] + shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll'] # link lite.so to paddlelite.libs -COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ -/inference_lite_lib/python/install/lite/lite.so" -if os.system(COMMAND) != 0: - raise Exception("patch third_party libs failed, command: %s" % COMMAND) +if os.name != 'nt': + COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ + /inference_lite_lib/python/install/lite/lite.so" + if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + + # remove unused paddle/libs/__init__.py if os.path.isfile(LIB_PATH+'/__init__.py'): @@ -61,6 +68,14 @@ PACKAGE_DIR = { 'paddlelite': LITE_PATH } +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in PACKAGE_DIR.items(): + fix_package_dir[k] = v.replace('/', '\\') + PACKAGE_DIR = fix_package_dir + + setup( name='paddlelite', version=PADDLELITE_VERSION, diff --git a/lite/api/python/setup_mac.py.in b/lite/api/python/setup_mac.py.in new file mode 100644 index 0000000000000000000000000000000000000000..c8dfe2cc5c13b3105fc1aed404676eefd40877e8 --- /dev/null +++ b/lite/api/python/setup_mac.py.in @@ -0,0 +1,73 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' + +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib'] + +# link lite.so to paddlelite.libs +COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_classify_lite_bm.cc b/lite/api/test_classify_lite_bm.cc index 7da7dc03745aa623e35dec5b344e16de03cf5aca..b2507e28adbe050e4715e0c28a433a259607e7a9 100644 --- a/lite/api/test_classify_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -36,7 +36,8 @@ void TestModel(const std::vector& valid_places) { predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); if (FLAGS_input_img_txt_path.empty()) { @@ -67,15 +68,13 @@ void TestModel(const std::vector& valid_places) { << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - auto* out_data = out->data(); + auto out = predictor.GetOutputs(); FILE* fp = fopen("result.txt", "wb"); - for (int i = 0; i < out->numel(); i++) { - fprintf(fp, "%f\n", out_data[i]); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } } fclose(fp); } diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index a17fc331310cfe17ec36be504b94ddacc724e90f..fa6e20230d68c73b0720606816a4594077278d56 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -15,7 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif #include #include diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc new file mode 100644 index 0000000000000000000000000000000000000000..d70ecf3c03955286244aa13cfe65f19569a55930 --- /dev/null +++ b/lite/api/test_yolov3_lite_bm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
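+// Runs YOLOv3 on the BM backend: input 0 takes an image blob of shape {1, 3, im_height, im_width}, input 1 takes the im_shape pair {im_height, im_width}, and every output tensor is dumped to result.txt.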
+ +#include +#include +#include +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + +DEFINE_string(input_img_txt_path, + "", + "if set input_img_txt_path, read the img filename as input."); + +namespace paddle { +namespace lite { + +void TestModel(const std::vector& valid_places) { + lite::Predictor predictor; + std::vector passes; + predictor.Build(FLAGS_model_dir, + FLAGS_model_dir + "/model", + FLAGS_model_dir + "/params", + valid_places, + passes); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + if (FLAGS_input_img_txt_path.empty()) { + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + } else { + std::fstream fs(FLAGS_input_img_txt_path, std::ios::in); + if (!fs.is_open()) { + LOG(FATAL) << "open input_img_txt error."; + } + for (int i = 0; i < item_size; i++) { + fs >> data[i]; + } + } + auto* image_tensor = predictor.GetInput(1); + image_tensor->Resize(DDim(std::vector({1, 2}))); + data = image_tensor->mutable_data(); + data[0] = FLAGS_im_height; + data[1] = FLAGS_im_width; + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + auto out = predictor.GetOutputs(); + FILE* fp = fopen("result.txt", "wb"); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } + } + fclose(fp); +} + +TEST(Yolov3, test_bm) { + std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + TestModel(valid_places); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 896b47a97fb20e6935764e12fbe9ebd646a4f816..e1c315f4a63ffd3ed8f51fa4b73ac88b50835cab 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include +#ifdef PADDLE_WITH_TESTING #include +#endif #include #include #include "lite/api/cxx_api.h" diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index fb459ae3621d1281f0a2433ca6b237a165d078a1..1e8734a6e45ead93bb33024a2e918cdb401265d9 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 9f478eab60538eeca38415afea4e0989eff5a04e..1d01642100109d14a413ad5e274606c88bf0005a 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/backends/arm/math/activation.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -711,6 +712,47 @@ void act_square(const float* din, float* dout, int size, int threads) { } } +template <> +void act_hard_swish(const float* din, + float* dout, + int size, + float threshold, + float scale, + float offset, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) * + ptr_in[0] / scale; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_reciprocal(const float* din, + float* dout, + int size, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = 1.0 / ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_abs(const float* din, float* dout, int size, int threads) { + for (int i = 0; i < size; ++i) { + dout[0] = (din[0] > 0 ? din[0] : -din[0]); + din++; + dout++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 63f4418d70db25f98dea2a405de1f4bb6b0b9111..50f60f300bbab9b9f0bcad222f31699b7bfadeab 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -72,6 +72,20 @@ void act_rsqrt(const T* din, T* dout, int size, int threads); template void act_square(const T* din, T* dout, int size, int threads); +template +void act_hard_swish(const T* din, + T* dout, + int size, + float threshold, + float scale, + float offset, + int threads); +template +void act_reciprocal(const T* din, T* dout, int size, int threads); + +template +void act_abs(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc index 65f93453388d7f41d73669f583d189bec9035bb5..e54d70ffbb119d0a91b82f67b77c9d778dea17bf 100644 --- a/lite/backends/arm/math/concat.cc +++ b/lite/backends/arm/math/concat.cc @@ -16,46 +16,3 @@ #include #include #include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output) { - int64_t concat_input_size = 1; - int64_t num_cancats = 1; - auto dim_0 = input[0]->dims(); - size_t num = input.size(); - for (int i = axis + 1; i < dim_0.size(); i++) { - concat_input_size *= dim_0[i]; - } - for (int i = 0; i < axis; i++) { - num_cancats *= dim_0[i]; - } - float *dst_ptr = output->mutable_data(); - const int out_concat_axis = output->dims()[axis]; - int64_t offset_concat_axis = 0; - int64_t out_sum = out_concat_axis * concat_input_size; - for (int n = 0; n < num; n++) { - auto dims = input[n]->dims(); - const float *src_ptr = input[n]->data(); - int64_t in_concat_axis = dims[axis]; - float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; - int64_t in_sum = in_concat_axis * concat_input_size; - for (int i = 0; i < num_cancats; i++) { - std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum); - dout_ptr += out_sum; - src_ptr += in_sum; - } - offset_concat_axis += in_concat_axis; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h index 4c6159e9e09b66edde812e5098e1263963f3e4da..44e8bf73e220f94dca4ba6713debfae77029867a 100644 --- 
a/lite/backends/arm/math/concat.h +++ b/lite/backends/arm/math/concat.h @@ -25,9 +25,39 @@ namespace lite { namespace arm { namespace math { -void concat_func(const std::vector &input, +template +void concat_func(const std::vector& input, const int axis, - lite::Tensor *output); + lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0]->dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n]->dims(); + auto* src_ptr = input[n]->data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += in_concat_axis; + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 47a4d427f5400212a80fc31336e462a1c48bd640..4d08c1e957d43b5b748ffdb90fd14a07a61d0183 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -302,10 +302,10 @@ void elementwise_add_grad_broadcast(const float* dout_grad, int pre, int n, int post) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, pre * n * post); } - if (y_grad) { + if (y_grad != nullptr) { memset(y_grad, 0, n * sizeof(float)); #pragma omp parallel for for (int i = 0; i < pre; ++i) { @@ -582,10 +582,10 @@ void elementwise_sub_grad(const float* dout_grad, float* x_grad, float* y_grad, int num) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, num); } - if (y_grad) { + if (y_grad != nullptr) { int cnt = num >> 4; int remain = num & 0x0f; float32x4_t minus = vdupq_n_f32(-1); @@ -624,10 +624,10 @@ void elementwise_sub_grad_broadcast(const float* dout_grad, int pre, int n, int post) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, pre * n * post); } - if (y_grad) { + if (y_grad != nullptr) { memset(y_grad, 0, n * sizeof(float)); #pragma omp parallel for for (int i = 0; i < pre; ++i) { diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc index 56104550d8d68e53ad9a2ac3148887d67480d6f6..a84eef2970b2837159609c1ded1ca0d9991ccfc6 100644 --- a/lite/backends/arm/math/reduce_mean.cc +++ b/lite/backends/arm/math/reduce_mean.cc @@ -198,6 +198,23 @@ void reduce_mean_hw(const float* src, reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); } +template <> +void mean_grad(const float* out_grad, float* in_grad, int size) { + float grad = out_grad[0] / size; + float32x4_t grad_v = vdupq_n_f32(grad); + int loop = size >> 2; + int remain = size & 3; + +#pragma omp parallel for + for (int i = 0; i < loop; ++i) { + vst1q_f32(in_grad, grad_v); + in_grad += 4; + } + for (int i = 0; i < remain; ++i) { + in_grad[i] = grad; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h index 
277ed209c058b5b4be76ce18a00683610e6afb7a..aaa9ff42c18d0cfa6a7cf11408dfba06a9444adc 100644 --- a/lite/backends/arm/math/reduce_mean.h +++ b/lite/backends/arm/math/reduce_mean.h @@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, int height_in, int width_in); +template +void mean_grad(const T* out_grad, T* in_grad, int size); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index 35f5f0ce2d93db59cbb856d8008e6f3138633e42..0689bb706ab3bac4b8b97059017181ef24dd8ee4 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) + +lite_cc_library(cuda_context SRCS context.cc DEPS device_info) add_subdirectory(math) diff --git a/lite/backends/cuda/context.cc b/lite/backends/cuda/context.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bac4c442c28848d38bd434d045c7888a1a92ac8 --- /dev/null +++ b/lite/backends/cuda/context.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/context.h" + +namespace paddle { +namespace lite {} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/context.h b/lite/backends/cuda/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5bed30a9603c6f6a48169ae31d66c989bd891836 --- /dev/null +++ b/lite/backends/cuda/context.h @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { + +template +class Context; + +using CUDAContext = Context; + +// Only works with CUDA kernels. 
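+// Rough usage sketch: InitOnce() creates the shared cuBLAS handle once (intended to be called by ContextScheduler), Init(dev_id, exec_stream_id, io_stream_id) binds the context to a device and its exec/IO streams, CopySharedTo() hands the cuBLAS handle to per-kernel contexts, and SetSyncStreams()/Sync() record and wait on events when several exec streams need to be synchronized.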
+template <> +class Context { + public: + typename Env::Devs& devs = + Env::Global(); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() { + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No CUDA device(s) found, CUDAContext init failed."; + } + } + void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or the current target does not exist!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + if (io_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "data stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + io_stream_id = 0; + } + if (exec_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "exec stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + exec_stream_id = 0; + } + + exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; + io_stream_ = devs[dev_id].io_streams()[io_stream_id]; + + exec_stream_id_ = exec_stream_id; + io_stream_id_ = io_stream_id; + need_sync_ = false; + } + void CopySharedTo(CUDAContext* ctx) { + CHECK(ctx); + CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; + ctx->cublas_fp32_ = cublas_fp32_; + } + + const cudaStream_t& exec_stream() const { return exec_stream_; } + void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + + const cudaStream_t& io_stream() const { return io_stream_; } + void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + + std::shared_ptr> cublas_fp32() { return cublas_fp32_; } + void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { + cublas_fp32_ = cublas_fp32; + } + + const std::vector& input_events() { return input_events_; } + void SetInputEvents(const std::vector& input_events) { + input_events_.clear(); + input_events_.assign(input_events.begin(), input_events.end()); + } + + const std::vector& output_events() { return output_events_; } + void SetOutputEvents(const std::vector& output_events) { + output_events_.clear(); + output_events_.assign(output_events.begin(), output_events.end()); + } + + std::vector all_exec_streams() { + int dev_id = TargetWrapper::GetCurDevice(); + return devs[dev_id].exec_streams(); + } + + void SetSyncStreams(const std::vector& nums) { + sync_streams_.clear(); + std::vector exec_streams = all_exec_streams(); + for (size_t i = 0; i < nums.size(); ++i) { + CHECK(nums[i] >= 0 && nums[i] < static_cast(exec_streams.size())) + << "stream id is not valid"; + sync_streams_.push_back(exec_streams[nums[i]]); + } + InitSyncEvents(nums.size()); + } + + void InitSyncEvents(const int num) { + sync_events_.clear(); + for (int i = 0; i < num; ++i) { + cudaEvent_t eve; + TargetWrapperCuda::CreateEventWithFlags(&eve); + sync_events_.push_back(eve); + } + } + + void SetNeedSync(bool sync) { need_sync_ = sync; } + bool need_sync() const { return need_sync_; } + + void Sync() { + CHECK_EQ(sync_streams_.size(), sync_events_.size()); + for (size_t i = 0; i < sync_events_.size(); ++i) { + TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]); + TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]); + } + } + + std::string name() const { return "CUDAContext"; } + + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ =
const_cast(context).cublas_fp32(); + return *this; + } + + private: + int device_id_; + // overall information + int exec_stream_id_; + int io_stream_id_; + cudaStream_t exec_stream_; + cudaStream_t io_stream_; + + // not thread-safe, should allocate for each thread. + std::shared_ptr> cublas_fp32_; + + // kernel information + std::vector input_events_; + std::vector output_events_; + // multi stream sync. + std::vector sync_streams_; + std::vector sync_events_; + bool need_sync_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff..78aa689ff767e8a454dec3aa48a97ecefafdbe7a 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -29,6 +29,7 @@ enum class BinaryOperation { kADD = 0, kMUL = 1, kDIV = 2, + kSUB = 3, }; template @@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x, if (type == BinaryOperation::kADD) return x + y; if (type == BinaryOperation::kMUL) return x * y; if (type == BinaryOperation::kDIV) return x / y; + if (type == BinaryOperation::kSUB) return x - y; } template diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index f0105e060f03df3e4d49c358cf314730cdd16393..eff959d992200592c21a024f56713b9abb4b87fb 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,19 +51,20 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, void CLContext::AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options) { + const std::string &options, + const std::string &time_stamp) { cl_int status{CL_SUCCESS}; VLOG(3) << " --- to get program " << file_name << " --- "; auto program = GetProgram(file_name, options); VLOG(3) << " --- end get program --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( + std::shared_ptr kernel( new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; kernels_.emplace_back(std::move(kernel)); STL::stringstream kernel_key; - kernel_key << kernel_name << options; + kernel_key << kernel_name << options << time_stamp; kernel_offset_[kernel_key.str()] = kernels_.size() - 1; } @@ -121,14 +119,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 1 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? 
gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 1 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} + cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size) { int preferred_lws = 0; int divisor = 2; - auto tmp0 = global_work_size[0]; - auto tmp1 = global_work_size[1]; - auto tmp2 = global_work_size[2]; + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; if (divisor > 1) { max_work_size /= divisor; @@ -136,18 +173,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, if (preferred_lws > 0 && preferred_lws <= max_work_size) { max_work_size = preferred_lws; } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; } - return cl::NDRange{static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; } } // namespace lite diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 1964c4bf56b55841ba735c79b2f7a17dc1ed451e..41059a0d42a95bbffed4c41611b9f3b8ac60861c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -27,6 +27,21 @@ namespace lite { class CLContext { public: + ~CLContext() { + for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { + // Note(ysh329): Don't need `clReleaseKernel` + kernels_[kidx].reset(); + } + kernels_.clear(); + kernel_offset_.clear(); + for (auto &p : programs_) { + // Note(ysh329): Dont't need `clReleaseProgram` + p.second.reset(); + } + programs_.clear(); + LOG(INFO) << "release cl::Program, cl::Kernel finished."; + } + cl::CommandQueue &GetCommandQueue(); cl::Context &GetContext(); @@ -36,7 +51,8 @@ class CLContext { void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = ""); + const std::string &options = "", + const std::string &time_stamp = ""); cl::Kernel &GetKernel(const int index); @@ -46,9 +62,15 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divitor = 2); + // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, + // size_t max_work_size); + private: std::unordered_map> programs_; - std::vector> kernels_; + std::vector> kernels_; std::map kernel_offset_; }; diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index cb29860dc7556bdaea3c09589a8c6120c5ef2a1a..08491d5d9fd195430a4b03673c38767f7e4a5be8 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl 
+++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale) { + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); + CL_DTYPE4 out; + out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); + out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); + out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); + out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void decode_center_size(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh % out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + CL_DTYPE4 prior_box_input[4]; + CL_DTYPE4 prior_box_var_input[4]; + CL_DTYPE4 target_box_input[4]; + + prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 0, prior_box_pos.y)); + prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 1, prior_box_pos.y)); + prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 2, prior_box_pos.y)); + prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 3, prior_box_pos.y)); + + prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y)); + prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y)); + prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y)); + prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y)); + + target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 1, target_box_pos.y)); + target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 2, target_box_pos.y)); + target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 3, target_box_pos.y)); + + CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2; + CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2; + + CL_DTYPE4 target_box_center_x; + CL_DTYPE4 target_box_center_y; + CL_DTYPE4 target_box_width; + CL_DTYPE4 target_box_height; + CL_DTYPE4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = 
prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); +} diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl index 8cdec7beabafc2701b6522fcb6492eff76353279..73a089d7591b98486daac2d4aaa29fe4f2192134 100644 --- 
a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -30,6 +30,143 @@ __kernel void elementwise_mul(__global image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } +__kernel void channel_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + /* if (x == 0 && y == 0) { + CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 output = mad(in, biase, 0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// c 1 1 +__kernel void channel_mul_d3(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + 
coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, +__global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +#if 0 // TODO(ysh329): comment code below +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + __kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, @@ -184,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } - +#endif diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl index b5346e3af4f472d5ed095d586cb68122655cf1c4..3e3d65394f9924edac735084c2fe5ce550f20684 100644 --- a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -14,14 +14,127 @@ limitations under the License. 
*/ #include +// onnx/pytorch instancenorm by lijian +__kernel void instance_norm_onnx(__private const int in_width, + __private const int in_height, + __private const int in_c_group, + __private const int local_work_size_x, + __private const int local_work_size_y, + __private const float epsilon, + __read_only image2d_t input, + __write_only image2d_t output) { + const int out_cn = get_global_id(0); + const int n = out_cn / in_c_group; + const int c = out_cn % in_c_group; + const int w = get_local_id(1); + const int h = get_local_id(2); + const int local_id = w * local_work_size_y + h; + const int local_total_size = local_work_size_x * local_work_size_y; -__kernel void instance_norm(__read_only image2d_t input, - __write_only image2d_t output, - __read_only image2d_t scale, - __read_only image2d_t bias, - const float epsilon, - const int in_h, - const int in_w){ + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef LOCAL_MEM_128 + __local float4 shared_mem[128]; +#elif defined(LOCAL_MEM_64) + __local float4 shared_mem[64]; +#else + __local float4 shared_mem[256]; +#endif + int xOffset = c * in_width; + int yOffset = n * in_height; + float4 sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)); + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 mean_val = shared_mem[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; + sum += temp * temp; + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + + float4 s = 1 / sigma; + + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); + float4 in_val = read_imagef(input, sampler, intout_pos); + half4 out_val = convert_half4((in_val - mean_val) * s); +#ifdef RELU + out_val = activation(out_val); +#endif + write_imageh(output, intout_pos, out_val); + } + } +} + + +// paddle instancenorm by zhangxi +__kernel void instance_norm_paddle(__read_only image2d_t input, + __write_only image2d_t output, + __read_only image2d_t scale, + __read_only image2d_t bias, + const 
float epsilon, + const int in_h, + const int in_w){ __local CL_DTYPE4 saved_mean[1024]; __local CL_DTYPE4 saved_variance[1024]; const int lid = get_local_id(0); diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 52009718803d7b98ebae481db547713e97b313c7..d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,13 +26,15 @@ CLRuntime* CLRuntime::Global() { CLRuntime::~CLRuntime() { if (command_queue_ != nullptr) { + command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + // For controlling the destruction order command_queue_.reset(); context_.reset(); device_.reset(); platform_.reset(); + device_info_.clear(); } bool CLRuntime::Init() { @@ -128,6 +127,12 @@ bool CLRuntime::InitializePlatform() { } bool CLRuntime::InitializeDevice() { + // ===================== BASIC ===================== + // CL_DEVICE_TYPE_GPU + // CL_DEVICE_NAME + // CL_DEVICE_SUPPORT + // CL_DEVICE_MAX_COMPUTE_UNITS + // CL_DEVICE_MAX_CLOCK_FREQUENCY std::vector all_devices; status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); @@ -140,27 +145,153 @@ bool CLRuntime::InitializeDevice() { auto device_name = device_->getInfo(); LOG(INFO) << "Using device: " << device_name; + + cl_device_type device_type = device_->getInfo(); + auto device_type_to_str = [](cl_device_type t) -> std::string { + std::string t_str{""}; + switch (t) { + case CL_DEVICE_TYPE_CPU: + t_str = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + t_str = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + t_str = "Accelerator"; + break; + case CL_DEVICE_TYPE_DEFAULT: + t_str = "Default"; + break; + default: + t_str = "Unknown"; + } + return t_str; + }; + LOG(INFO) << "device_type:" << device_type_to_str(device_type); + device_info_["CL_DEVICE_TYPE"] = device_type; + + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units; + + auto max_clock_freq = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq; + device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq; + + // ===================== MEMORY ===================== + // CL_DEVICE_LOCAL_MEM_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + // CL_DEVICE_GLOBAL_MEM_SIZE + auto local_mem_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb + << " KB."; + device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb; + + auto global_mem_cache_size_kb = + static_cast(device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):" + << global_mem_cache_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb; + + auto global_mem_cacheline_size_kb = + static_cast( + device_->getInfo()) / + 1024; + 
LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):" + << global_mem_cacheline_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] = + global_mem_cacheline_size_kb; + + auto global_mem_size_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb; + + // ===================== WORK_GROUP ===================== + // CL_DEVICE_MAX_WORK_GROUP_SIZE + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + // CL_DEVICE_MAX_WORK_ITEM_SIZES + auto max_work_group_size = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size; + device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size; + + auto max_dims_num = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num; + device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num; + + auto max_work_item_sizes = device_->getInfo(); + for (size_t i = 0; i < max_work_item_sizes.size(); ++i) { + LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i]; + std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i); + device_info_[dim_key] = max_work_item_sizes[i]; + } + + // ===================== BUFFER ===================== + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + auto max_constant_buffer_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:" + << max_constant_buffer_size_kb; + device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] = + max_constant_buffer_size_kb; + + // ===================== IMAGE ===================== + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_IMAGE2D_MAX_HEIGHT + // CL_DEVICE_IMAGE2D_MAX_WIDTH auto image_support = device_->getInfo(); if (image_support) { LOG(INFO) << "The chosen device supports image processing."; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1; } else { LOG(INFO) << "The chosen device doesn't support image processing!"; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0; return false; } + + auto image2d_max_height = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height; + device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height; + + auto image2d_max_width = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width; + device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width; + + // ===================== OTHERS / EXTENSION / VERSION ===================== + // CL_DEVICE_EXTENSIONS + // CL_DEVICE_ADDRESS_BITS auto ext_data = device_->getInfo(); VLOG(4) << "The extensions supported by this device: " << ext_data; if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1; } else { LOG(INFO) << "The chosen device doesn't support the half data type!"; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0; } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; + + auto address_bits = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits; + device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits; + + auto driver_version = device_->getInfo(); + LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version; + return true; } 
+std::map& CLRuntime::GetDeviceInfo() { + if (0 != device_info_.size()) { + return device_info_; + } + InitializeDevice(); + return device_info_; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 6683a5d92df02ae3a95f2e1b01feb2f303da8558..503b3a011642a8e018781c08647a958c521e6fac 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -55,8 +52,10 @@ class CLRuntime { void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + std::map& GetDeviceInfo(); + private: - CLRuntime() = default; + CLRuntime() { Init(); } ~CLRuntime(); @@ -84,6 +83,8 @@ class CLRuntime { return queue; } + std::map device_info_; + std::string cl_path_; std::shared_ptr platform_{nullptr}; diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h index b7f14c15e61ba050220ef0819fa9c3d13a7b8606..de01f896a6eb461eb24023a77935bba07de029e7 100644 --- a/lite/backends/opencl/cl_utility.h +++ b/lite/backends/opencl/cl_utility.h @@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } - +#ifndef LITE_SHUTDOWN_LOG #define CL_CHECK_FATAL(err_code__) \ if (err_code__ != CL_SUCCESS) { \ LOG(FATAL) << string_format( \ @@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } +#else +#define CL_CHECK_FATAL(err_code__) +#endif } // namespace lite } // namespace paddle diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe --- /dev/null +++ b/lite/backends/rknpu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs}) diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380 --- /dev/null +++ b/lite/backends/rknpu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
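// Editor's note: the sketch below is an illustration only, not part of this
// patch. It shows roughly how the Device::Build() helper defined in this file
// is expected to be driven by an RKNPU subgraph bridge; the graph setup, the
// element type of the node vectors, and all names are assumptions based on
// the rk::nn types referenced here.
//
//   rk::nn::Graph graph;
//   std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, output_nodes;
//   // ... register operators/tensors on `graph`, fill the node vectors ...
//   std::string model_name = "rknpu_subgraph_0";  // hypothetical name
//   auto exector = paddle::lite::rknpu::Device::Global().Build(
//       model_name, &graph, input_nodes, output_nodes);
//   // The returned rk::nn::Exection is then used to run inference.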
+ +#include "lite/backends/rknpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace rknpu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ) { + VLOG(3) << "[RKNPU] Build model"; + + rk_graph->SetInputsOutputs(input_nodes, output_nodes); + + std::unique_ptr exector = + std::unique_ptr(new rk::nn::Exection(rk_graph)); + + exector->Build(); + + return exector; +} + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15 --- /dev/null +++ b/lite/backends/rknpu/device.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. 
+ std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt index 63b41ae77d0f3949e3d1de13f9db5ca99b4f1c41..38b47ae3120608c7950a1f081e9ec2b133fb955e 100644 --- a/lite/backends/x86/CMakeLists.txt +++ b/lite/backends/x86/CMakeLists.txt @@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) +lite_cc_library(x86_cpu_info SRCS cpu_info.cc) add_subdirectory(jit) add_subdirectory(math) diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index a05a57e93b23008e49683764b5ed669d5c425e5b..2aaa798fa94b7dd47e4dc15d50e663b8fd3c083a 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib"); #elif defined(_WIN32) return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 7d051aa6f5802844753b71fd43400e20b7f5965b..a3376be423828b25c6eda6fff30a56578c7bbbe5 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -28,6 +28,12 @@ #define posix_memalign_free free #endif +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 
0 : errno) +#endif + // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); @@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); +#else PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, "GenBase Alloc %ld error!", size); +#endif PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); return ptr; } diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 8d61fb3bbb97705c697fba934e6cab9424f85bad..9cf3281152840416dc141f98992499c663783b7a 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -96,8 +96,8 @@ class BeamSearchFunctor { // : nullptr; // fill in data - std::vector low_level; - size_t low_offset = 0; + std::vector low_level; + uint64_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc index 904870207b08d462025ecb4b84d6cf57f7b13f26..233fa03fbaa31165dae4453affb148276f8c6584 100644 --- a/lite/backends/x86/math/beam_search_test.cc +++ b/lite/backends/x86/math/beam_search_test.cc @@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* pre_scores) { // lod paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 72d0736268f342187f0be8c6348f5bed75df30ea..34b258892be05625ae88076eff175f56a53d3537 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -483,7 +483,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } template <> @@ -759,7 +759,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } else { PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); @@ -773,7 +773,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data(), + mat_out->template mutable_data(), dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, dim_a.stride_, dim_b.stride_); diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc index bec93dde41fdb654cfbfd20f5d9e59d1d372e3a8..df75654aebaba26b9889d97445bd889cdf2f4eb0 100644 --- a/lite/backends/x86/math/concat_and_split.cc +++ b/lite/backends/x86/math/concat_and_split.cc @@ -51,7 +51,7 @@ class ConcatFunctor { // auto cpu_place = boost::get(context.GetPlace()); // computation - auto output_data = output->mutable_data(); + auto output_data = output->template mutable_data(); int col_idx = 0; for (int j = 0; j < num; ++j) { int col_len = input_cols[j]; @@ -108,7 +108,7 @@ class SplitFunctor { int col_len = output_cols[j]; auto* out_tensor = outputs->at(j); if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; + T* dst_ptr = out_tensor->template mutable_data() + k * col_len; std::copy_n(src_ptr + col_idx, col_len, dst_ptr); // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, // sizeof(T) * col_len); diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 366486924a8c4a5eefd6341183b4f1bc1c0277ad..941a34643669f060cdd18f38f92c39e529da7b19 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -50,8 +50,8 @@ class CrossEntropyFunctor { .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); + const T* prob_data = prob->template data(); + T* loss_data = out->template mutable_data(); const int64_t* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index 1c4c6a49f5bb804a57344c59368d18255e8a7912..b916c912ffc2a4d62b63b98fdce150b353ba087e 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -99,7 +99,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int c = 0; c < channels_col; ++c) { @@ -161,7 +161,7 @@ class Im2ColFunctordims()[1]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { @@ -235,7 +235,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h index 4623f045bb1cbe67605b36621efcc3285b989ad5..97579647d4ec3a9a95e033a153417cb0aaadbeb6 100644 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ b/lite/backends/x86/math/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, int output_width = col->dims()[4]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int col_matrix_width = output_width * output_height; int 
im_size = im_height * im_width; size_t copy_size = sizeof(T) * output_width; @@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, constexpr int prw = 1; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int im_size = im_height * im_width; int col_matrix_width = output_width * output_height; int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index a17807e8a997f0ecf908313a4cb205676e4fa4b8..05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -65,7 +65,7 @@ struct TensorSetConstantCPU { : tensor_(tensor), value_(value) {} template void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); + auto* begin = tensor_->template mutable_data(lite::TargetType::kX86); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } lite::Tensor* tensor_; @@ -126,7 +126,7 @@ struct RowwiseAdd { const T* input_data = input.data(); const T* vector_data = vector.data(); - T* output_data = output->mutable_data(); + T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t j = 0; j < size; ++j) { output_data[i * in_dims[0] + j] = diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index 3aaca2e59370f8f2b922554ec6f378bb2a3de9b5..acfb76759f6fc9fa4122afd2388bc3adf8f5ea22 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -83,7 +83,7 @@ class ColwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), size); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -129,7 +129,7 @@ class RowwiseMean { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -173,7 +173,7 @@ class RowwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc index 20b40fe7c5000cc1d0ee80c18efa5d1defc911f0..f97b16f7fb3326a6d2eb186e2984df3dbd0a0a90 100644 --- a/lite/backends/x86/math/maxouting.cc +++ b/lite/backends/x86/math/maxouting.cc @@ -35,7 +35,7 @@ class MaxOutFunctor { // c_size means the output size of each sample int c_size = fea_size * output_channels; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int new_bindex = c_size * i; @@ -72,7 +72,8 @@ class MaxOutGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template 
mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int blen = fea_size * output_channels * i; diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index ab6c1edb481f914d5751149aca2595fee550ca51..4393c42157bb7667ec2218e8b76f05a2c60bcc86 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -54,8 +54,8 @@ class Pool2dFunctor { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; - const T* input_data = input->data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + const T* input_data = input->template data(); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -137,7 +137,8 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -220,7 +221,8 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -322,7 +324,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -425,7 +427,8 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -530,7 +533,8 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h index 5312b3df10a41444c073f0cf61d69bce6fc3859a..4351df68a2630c2b8c6f7285f3955a9b06165f67 100644 --- a/lite/backends/x86/math/sample_prob.h +++ b/lite/backends/x86/math/sample_prob.h @@ -58,11 +58,11 @@ class SampleWithProb { const int64_t* label_data = L->data(); // int64_t* samples_data = // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); + // T* probabilities_data = P->template mutable_data(ret_dim, Target); S->Resize({batch_size, num_sampled_classes}); auto* samples_data = S->mutable_data(Target); P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); + auto* probabilities_data = P->template mutable_data(Target); // temp sets for unique sampling std::unordered_set tmp_samples; diff --git 
a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc index 56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021..014b213d4f10f7161dc1881d582cca93f2be58e5 100644 --- a/lite/backends/x86/math/search_fc.cc +++ b/lite/backends/x86/math/search_fc.cc @@ -42,7 +42,7 @@ class SearchFcFunctor { lite::DDim dims(std::vector({bottom.dims()[0], out_size})); const auto bottom_data = bottom.data(); - auto top_data = top->mutable_data(lite::TargetType::kX86); + auto top_data = top->template mutable_data(lite::TargetType::kX86); const auto weights = w.data(); auto blas = math::GetBlas(context); call_gemm(blas, diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index f8f1b42361832771ba04d1bdc8b3e2e05f954e29..acb377e31ccac96547fc4f0644332cfad36d66bc 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -52,7 +52,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); - auto* out_data = out_value->mutable_data(); + auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); std::copy_n(in1_data, in1_value.numel(), out_data); @@ -87,7 +87,7 @@ struct SelectedRowsAddTensor { functor(context, output, 0.0); auto* in1_data = in1_value.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -127,7 +127,7 @@ struct SelectedRowsAddTo { in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); auto* in1_data = in1_value.data(); - auto* in2_data = in2_value->mutable_data(); + auto* in2_data = in2_value->template mutable_data(); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); } }; @@ -161,7 +161,7 @@ struct SelectedRowsSumTo { input2->set_rows(in2_rows); auto* in2_value = input2->mutable_value(); - T* in2_data = in2_value->mutable_data(); + T* in2_data = in2_value->template mutable_data(); auto blas = math::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { @@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->mutable_data(); + auto* input2_data = input2->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -305,7 +305,7 @@ struct MergeAdd { lite::DDim dims(std::vector( {static_cast(merged_row_set.size()), input_width})); out.mutable_value()->Resize(dims); - auto* out_data = out.mutable_value()->mutable_data(); + auto* out_data = out.mutable_value()->template mutable_data(); if (merged_row_set.size() == row_num && !sorted_result) { // no duplicated ids, just concat the result together @@ -385,7 +385,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* input2_data = input2->template data(); // FIXME(typhoonzero): use macro fix the below messy code. 
switch (op) { diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index c12c05414d717dce706590a491ccae2384f3bfe5..aa7aeac532e2fa1f90d452924b364be1896ee862 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - const size_t* index = index_lod.data(); + const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( @@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor { auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); + auto* dst_data = dst->template mutable_data(); const int sz = width * sizeof(T); if (is_src_index) { for (int i = 0; i < height; ++i) { diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a70cc5bf73522f97ab312fc48553b5316dbf8376..63df008b6dfca936265019a71ac0a553c525dc73 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { // batch_lods[2] is the sort order for the input LoDTensor. batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods->at(0).data(); - size_t* seq2batch_idx = batch_lods->at(1).data(); + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods->at(2).data(); + auto* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index fbb6c11a5f7a0cbae36d2f8fba0b141dadadf542..eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -22,15 +22,15 @@ namespace math { template void CopyValidData(lite::Tensor* dst_tensor, const lite::Tensor* src_tensor, - const std::vector& seq_offsets, + const std::vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; - const T* src_data = src_tensor->data(); - T* dst_data = dst_tensor->mutable_data(); + const T* src_data = src_tensor->template data(); + T* dst_data = dst_tensor->template mutable_data(); int seq_cpy_gap = step_width; int pad_cpy_gap = @@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor { "'step_width'."); // fill padding value - T* pad_data = pad_tensor->mutable_data(); + T* pad_data = pad_tensor->template mutable_data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { fast_mem_init( diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 
a3f4512042de4c7a2fc665f2fd41777d472225f5..43407014dea0ed0c78ab29da7fb8ebb0e0310566 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; enum CopyType { kSeqToPad, kPadToSeq }; -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; +inline static uint64_t MaximumSequenceLength( + const std::vector& seq_offset) { + uint64_t seq_num = seq_offset.size() - 1; + uint64_t max_seq_len = 0; for (size_t i = 0; i < seq_num; ++i) { max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } @@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( inline static void CheckDims(const lite::DDim& seq_tensor_dims, const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, + const std::vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 186b8b5543c7132867093616c83b45ae8ff27d3c..34c55c5714e467954bc1bb79d9b1385ef5cfe497 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int* max_index = index->mutable_data(); int64_t num_seq = out_dims[0]; @@ -103,7 +103,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; @@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { const T* og_data = out_grad.data(); const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); + T* ig_data = in_grad->template mutable_data(); SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); @@ -170,7 +170,7 @@ class LastSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { int64_t in_w = in_grad->numel() / in_grad->dims()[0]; PADDLE_ENFORCE(in_w == out_w); const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); + T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); @@ -288,7 +288,7 @@ class SequencePoolFunctor { auto lod = input.lod()[0]; if (pooltype == "SUM") { const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); + T* dst = output->template mutable_data(TARGET(kX86)); jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), 
jit::SeqPoolType::kSum); diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index a73014767345842f09ac2ff0cd5c2e7231c1f90a..b91f43a571994bef95650361a6dc62c0465837a7 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { TEST(SequencePoolingGrad, CPU_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); @@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc index fad0628de15379b58847827cc3d48bf6085cbda2..25c7be0d0e2747f4f28c1d82f8855872d57726d1 100644 --- a/lite/backends/x86/math/sequence_scale.cc +++ b/lite/backends/x86/math/sequence_scale.cc @@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor { size_t seq_width = seq->dims()[1]; lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(lite::TargetType::kX86); + T* seq_data = seq->template mutable_data(lite::TargetType::kX86); for (size_t i = 0; i < num_seq; ++i) { for (size_t j = lod[level][i] * seq_width; j < lod[level][i + 1] * seq_width; diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc index 035a7923c70f91cf27f1d845f68110f8f33cb73d..97e27fed59f4bc1a4c457ea9cf515da6caca9a1c 100644 --- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor { auto pos_data = pos->mutable_data(lite::TargetType::kX86); int offset = 0; - std::vector vec_out_lod; + std::vector vec_out_lod; vec_out_lod.reserve(batch_size + 1); for (int i = 0; i <= batch_size; ++i) { offset = row_lod[i]; @@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor { out->set_lod(lod_temp); auto in_data = in.data(); - auto out_data = out->mutable_data(lite::TargetType::kX86); + auto out_data = out->template mutable_data(lite::TargetType::kX86); T* sum_data = new T[max_k]; for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ec45377bc55154a4a36ebc5c3684ab7efeeef88e..1ba84dda42093155b10fa74a49e953d6663b8c88 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -108,8 +108,8 @@ class SoftmaxFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T max_val = *std::max_element(in_data, in_data + num_classes); max_val *= static_cast(-1); @@ -219,9 +219,9 @@ class SoftmaxGradFunctor> { const int num_remain = num_classes / axis_dim; if 
(num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T scalar; vec_mul_reduce( diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index 20b913331308c8b8c95d190b6b0b3d76ccac354b..bfc7084c9ff018101ca3dfc1d1748083b1449662 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -104,12 +104,12 @@ class Tree2ColFunctor { patch_size = processing_list.size(); // T *patch_data = - // patch->mutable_data({static_cast(patch_size), + // patch->template mutable_data({static_cast(patch_size), // static_cast(patch_elem_size)}, // cpu_place); patch->Resize({static_cast(patch_size), static_cast(patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); + auto *patch_data = patch->template mutable_data(lite::TargetType::kX86); constant(context, patch, 0); const T *features = node_features.data(); @@ -166,12 +166,12 @@ class Col2TreeFunctor { } } // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), + // in_grad->template mutable_data({static_cast(node_count), // static_cast(grad_elem_size)}, // cpu_place); in_grad->Resize({static_cast(node_count), static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kX86); constant(context, in_grad, 0); const T *out_g = out_grad.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 568f9952cab755c8441695e1a9266a2001d2b9a9..119d7294e9ec21e67f09776ad20d04f15b8b81ce 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -36,7 +36,7 @@ class Unpool2dMaxFunctor { int output_feasize = output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor { int output_feasize = output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 8fd5e8954e2010d5226d56ac4a87a44e6364c8c6..91979bb7fdcfe66d84ded3f9797144ddafc8769e 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -75,7 +75,7 @@ class Vol2ColFunctor { "mismatching."); const T* vol_data = vol.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; @@ -159,7 +159,7 @@ class Col2VolFunctor { output_width, "input_width and output_width are " "mismatching."); - T* vol_data = vol->mutable_data(); + T* vol_data = vol->template mutable_data(); const T* col_data = 
col.data(); for (int c = 0; c < channels_col; ++c) { diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h index 0689ec4c234509cee6f10f8e0f7dd432edae5c4e..49794b8e15a8f90a6512798baa842534df879f6b 100644 --- a/lite/backends/x86/parallel.h +++ b/lite/backends/x86/parallel.h @@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { // Do not support nested omp parallem. num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); #endif - return std::max(num_threads, 1L); + return std::max(num_threads, 1L); } using ThreadHandler = diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h index c1b81159aca979efe4b46777a1cef49e44b95e27..0e1e2b77b796eae201c55edcd3caecc263e4271e 100644 --- a/lite/backends/x86/port.h +++ b/lite/backends/x86/port.h @@ -14,10 +14,10 @@ #pragma once +#include #include #include -#include #include #include @@ -37,7 +37,9 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#define NOMINMAX // msvc max/min macro conflict with std::min/max #include +#include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) @@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +extern struct timeval; static int gettimeofday(struct timeval *tp, void *tzp) { time_t clock; struct tm tm; diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt index 4491fdeaefe9f16265bdee2c07ebb02b86a2b038..85bef0452c41ce35c90d9bd058bb7fdefd030f3a 100644 --- a/lite/backends/xpu/CMakeLists.txt +++ b/lite/backends/xpu/CMakeLists.txt @@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) return() endif() -lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +if(LITE_WITH_XTCL) + lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +endif() +lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h index 6de18d5466da6e6b791363d2e275ea72376c78b8..a2cc3206d3d0391d89690026561f47983e9376c9 100644 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -14,12 +14,12 @@ #pragma once -#include #include #include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { diff --git a/lite/backends/xpu/math.h b/lite/backends/xpu/math.h new file mode 100644 index 0000000000000000000000000000000000000000..48352736d45a20d9abd496d9dd10b000d3f15a28 --- /dev/null +++ b/lite/backends/xpu/math.h @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { +namespace math { + +static inline long round_half_to_even(const float src) { // NOLINT + long ret = llround(src); // NOLINT + if (fabs(fabs(round(src) - src) - 0.5) > 0) { + return ret; + } else { + if (abs(ret) % 2 == 0) { + return ret; + } else { + return ret + (ret > 0 ? -1 : 1); + } + } +} + +static float ieee_compliance_0(float f) { + uint32_t *ptr = reinterpret_cast(&f); + uint32_t sign = (*ptr) & 0x80000000; + uint32_t uf = 0; + // nan -> inf + if (std::isnan(f)) { + uf = (sign | 0x7F800000); + float *ptr = reinterpret_cast(&uf); + return *ptr; + } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) { + return f; + } else { + // denormal -> +-0 + uf = 0x0; + float *ptr = reinterpret_cast(&uf); + return *ptr; + } +} + +template +static inline T fp32_to_intx(const float f, float max) { + max = ieee_compliance_0(max); + float input = ieee_compliance_0(f); + // +0 and -0 -> +0 + if (input == 0) { + input = 0.0f; + } + + float tmp = RMAX / max; + if (std::isinf(tmp)) { + uint32_t *ptr = reinterpret_cast(&input); + if ((*ptr) >> 31 & 1) { + return T(-RMAX); + } else { + return T(RMAX); + } + } + + tmp = input * tmp; + if (std::isnan(tmp)) { + return T(RMAX); + } + + tmp = ieee_compliance_0(tmp); + // early check to avoid INF or big value get into convertor func. + if (tmp > RMAX) { + return T(RMAX); + } + if (tmp < -RMAX) { + return T(-RMAX); + } + T ret = (T)round_half_to_even(tmp); + if (ret > RMAX) { + ret = T(RMAX); + } + if (ret < -RMAX) { + ret = T(-RMAX); + } + return ret; +} + +static inline int16_t fp32_to_int16(const float f, float max) { + int16_t v1 = fp32_to_intx(f, max); + return v1; +} + +static inline int ConvertFP32ToInt16(const void *input, + void *output, + float max_val, + int len) { + for (int i = 0; i < len; i++) { + static_cast(output)[i] = + fp32_to_int16(static_cast(input)[i], max_val); + } + return 0; +} + +static inline float FindMaxAbs(const float *data, int len) { + float max_f = 0.0f; + for (int i = 0; i < len; ++i) { + float max = std::abs(data[i]); + if (max > max_f) { + max_f = max; + } + } + return max_f; +} + +template +static inline void Transpose(const T *in, T *out, int h, int w) { + for (int h1 = 0; h1 < w; ++h1) { + for (int w1 = 0; w1 < h; ++w1) { + out[h1 * h + w1] = in[w1 * w + h1]; + } + } +} + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return lite::DDim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return lite::DDim({y_dim[0], 1}); +} + +/** + * Matrix Descriptor of a memory buffer. + * + * It is used for Blas::MatMul. MatMul operator can be batched. + * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a + * `batch_size` times of GEMM. The batched GEMM could be faster base on the + * implementation of the blas library. The batch size could be zero. If any + * matrix of `matmul` has a batch size, the will be a batched GEMM, too. 
e.g., + * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be + * [BatchSize, H1, W2] + * + * The boolean flag, `trans`, describe the memory is the transpose of matrix or + * not. If the trans is true, the last two dims of matrix are transposed. The + * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. + * + * The MatDescriptor is not only the dimension or shape of a matrix, it also + * contains the layout, stride of matrix. It is clearer to have a structure than + * reuse `DDim`. + */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcbc1e275cca8c32003cbef74dfb1f6d4caee93 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { + +void* TargetWrapperXPU::Malloc(size_t size) { + void* ptr{nullptr}; + xpu_malloc(&ptr, size); + return ptr; +} + +void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } + +void TargetWrapperXPU::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + switch (dir) { + case IoDirection::HtoD: + xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + break; + case IoDirection::DtoH: + xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + break; + default: + LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c42d4139246085d8b9a367b45b60699209d0b668 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
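Editorial note (illustrative, not part of the patch): the new lite/backends/xpu/math.h above adds symmetric max-abs quantization helpers (FindMaxAbs plus ConvertFP32ToInt16 built on fp32_to_intx, which rounds half to even and normalizes IEEE special cases), a Transpose helper, and a MatDescriptor for batched MatMul; the XPU fuse passes later in this diff use them to pre-quantize FC weights. A minimal standalone sketch of the quantization idea follows; it uses plain round-and-clamp instead of the real rounding rules, and the names are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Largest absolute value in the buffer; used as the symmetric quantization scale.
static float FindMaxAbs(const float* data, int len) {
  float max_f = 0.0f;
  for (int i = 0; i < len; ++i) max_f = std::max(max_f, std::abs(data[i]));
  return max_f;
}

// Map each float to int16 as round(x * 32767 / max_val), clamped to the int16 range.
// (The real helper rounds half to even and handles NaN/Inf/denormals; omitted here.)
static void QuantizeToInt16(const float* in, int16_t* out, float max_val, int len) {
  const float scale = (max_val > 0.0f) ? 32767.0f / max_val : 0.0f;
  for (int i = 0; i < len; ++i) {
    float v = std::round(in[i] * scale);
    v = std::min(32767.0f, std::max(-32767.0f, v));
    out[i] = static_cast<int16_t>(v);
  }
}

int main() {
  std::vector<float> w = {0.5f, -1.25f, 2.0f, -0.125f};
  std::vector<int16_t> q(w.size());
  const int len = static_cast<int>(w.size());
  const float max_f = FindMaxAbs(w.data(), len);
  QuantizeToInt16(w.data(), q.data(), max_f, len);
  for (int i = 0; i < len; ++i) std::printf("%+.3f -> %d\n", w[i], q[i]);
  return 0;
}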
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperXPU = TargetWrapper; + +template <> +class TargetWrapper { + public: + static size_t num_devices() { return 1; } + static size_t maximum_stream() { return 0; } + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/xpu_header_sitter.h b/lite/backends/xpu/xpu_header_sitter.h new file mode 100644 index 0000000000000000000000000000000000000000..875e67d57d4ba2110bfbffb7ee9d1d6a876060fa --- /dev/null +++ b/lite/backends/xpu/xpu_header_sitter.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
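Editorial note (illustrative, not part of the patch): lite/backends/xpu/target_wrapper.{h,cc} above follow Lite's usual pattern of specializing TargetWrapper per target with a static-only API (Malloc/Free/MemcpySync wrapping xpu_malloc/xpu_free/xpu_memcpy). A host-only sketch of that specialization pattern, so it runs without the XPU runtime; the actual XPU calls appear only in comments.

#include <cstdio>
#include <cstdlib>

// Stand-in for lite's TargetType; the real code specializes TargetWrapper per target.
enum class TargetType { kHost, kXPU };

template <TargetType Target>
class TargetWrapper {};  // primary template left empty; each backend specializes it

// Host specialization, mirroring the static-only interface the XPU patch adds.
template <>
class TargetWrapper<TargetType::kHost> {
 public:
  static void* Malloc(size_t size) { return std::malloc(size); }
  static void Free(void* ptr) { std::free(ptr); }
};

using TargetWrapperHost = TargetWrapper<TargetType::kHost>;

int main() {
  void* p = TargetWrapperHost::Malloc(64);  // the XPU version calls xpu_malloc here
  std::printf("allocated %p\n", p);
  TargetWrapperHost::Free(p);               // and xpu_free here
  return 0;
}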
+ +#pragma once + +#pragma GCC system_header +#include +#include +#include + +#if defined(LITE_WITH_XTCL) +#include +#endif + +namespace paddle { +namespace lite { + +namespace xdnn = baidu::xpu::api; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 35aad501070282b49cdd8df72185ad9d21dab9fe..6bd353a9e13bdfbd1fce0291e04f4b5925b18ac1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -5,6 +5,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc DEPS target_wrapper_host place X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda + XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper BM_DEPS target_wrapper_bm @@ -37,7 +38,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index afc104073684ff00395fb32335630705ff3f7bc8..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.cc b/lite/core/context.cc index be886168e02e21d192305d701110ce5075ffba63..be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -15,5 +15,11 @@ #include "lite/core/context.h" namespace paddle { -namespace lite {} // namespace lite +namespace lite { + +#ifdef LITE_WITH_XPU +thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +#endif + +} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 6b826fe46f973d9812d76802a48b6d63f16b5081..7ab45bae1d3b3ff518ffa7a1db61cd1f56c92728 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -16,8 +16,7 @@ #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL #include @@ -29,6 +28,9 @@ #include #include "lite/backends/mlu/mlu_utils.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/xpu_header_sitter.h" +#endif #include #include @@ -50,7 +52,6 @@ class Context; using HostContext = Context; using X86Context = Context; -using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; using XPUContext = Context; @@ -58,6 +59,7 @@ using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using 
MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -102,17 +104,59 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { public: Context() {} explicit Context(const XPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + static xdnn::Context* GetRawContext() { + if (_tls_raw_ctx == nullptr) { + _tls_raw_ctx = xdnn::create_context(); + CHECK(_tls_raw_ctx); + } + return _tls_raw_ctx; + } + + static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { + xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + } + + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + std::string name() const { return "XPUContext"; } + + private: + static thread_local xdnn::Context* _tls_raw_ctx; }; #endif @@ -227,12 +271,10 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); + return DeviceInfo::Global().MLUCoreVersion(); } - int MLUCoreNumber() { - return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); - } + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } u32_t affinity() { return affinity_; } @@ -258,99 +300,6 @@ class Context { }; #endif // LITE_WITH_MLU -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. 
-template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - cublas_fp32_ = std::make_shared>(); - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0UL) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= static_cast(devs.size())) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t& exec_stream() const { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t& io_stream() const { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - CUDAContext& operator=(const CUDAContext& context) { - this->Init( - context.device_id_, context.exec_stream_id_, context.io_stream_id_); - cublas_fp32_ = const_cast(context).cublas_fp32(); - return *this; - } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
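Editorial note (illustrative, not part of the patch): the comment on the removed cuBLAS member above and the new thread_local xdnn::Context in XPUContext::GetRawContext (earlier in this context.h hunk) reflect the same constraint: raw device handles are not thread-safe, so each thread lazily creates and then reuses its own. A standalone sketch of that idiom, using a stand-in RawCtx type instead of xdnn::Context.

#include <cassert>
#include <thread>

struct RawCtx { int id; };                    // stand-in for xdnn::Context
static RawCtx* CreateCtx() { return new RawCtx{0}; }

class XpuCtxHolder {
 public:
  // Create the per-thread context on first use and reuse it afterwards,
  // mirroring XPUContext::GetRawContext() in the patch.
  static RawCtx* Get() {
    if (tls_raw_ctx_ == nullptr) {
      tls_raw_ctx_ = CreateCtx();
    }
    return tls_raw_ctx_;
  }

 private:
  static thread_local RawCtx* tls_raw_ctx_;
};

thread_local RawCtx* XpuCtxHolder::tls_raw_ctx_{nullptr};

int main() {
  RawCtx* a = XpuCtxHolder::Get();
  RawCtx* b = XpuCtxHolder::Get();
  assert(a == b);                             // the same thread reuses one context
  std::thread t([] { assert(XpuCtxHolder::Get() != nullptr); });  // other threads get their own
  t.join();
  return 0;
}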
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - #ifdef LITE_WITH_X86 template <> class Context { @@ -423,7 +372,9 @@ class ContextScheduler { return *x; } - std::unique_ptr NewContext(TargetType target) { + std::unique_ptr NewContext( + TargetType target, + /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): @@ -440,7 +391,7 @@ class ContextScheduler { case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; @@ -457,6 +408,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -526,6 +483,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6d856b91888e652568fdae0452345e4dadaa069c..29ac96ed744b016833a746b35002dd68109efd8b 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,6 +66,15 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector DeviceInfo::mean_vec_; +thread_local std::vector DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + +#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 
5f5c4259e9614e74e04f61983031abf32a5a1621..b06eb8d944735971133bb7a29aa0f06075e60626 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -55,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector& MeanVec() const; + const std::vector& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -106,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -136,7 +159,7 @@ class Env { static Devs* devs = new Devs(); return *devs; } - static void Init(int max_stream = 4) { + static void Init(int max_stream = 6) { #ifdef LITE_WITH_MLU CNRT_CALL(cnrtInit(0)); #endif @@ -148,10 +171,11 @@ class Env { // Get device count count = API::num_devices(); if (count == 0) { - CHECK(false) << "No device found!"; + LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!"; } else { LOG(INFO) << "Found " << count << " device(s)"; } + CHECK_GT(max_stream, 0) << "max_stream must be greater than 0."; // create all device for (int i = 0; i < count; i++) { auto dev = Device(i, max_stream); @@ -211,8 +235,8 @@ class Device { std::string name() { return device_prop_.name; } int core_num() { return device_prop_.multiProcessorCount; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } - std::vector exec_streams() { return exec_stream_; } - std::vector io_streams() { return io_stream_; } + const std::vector& exec_streams() { return exec_stream_; } + const std::vector& io_streams() { return io_stream_; } int sm_version() { return sm_version_; } bool has_fp16() { return has_fp16_; } diff --git a/lite/core/memory.cc b/lite/core/memory.cc index dfe07cdc4b2750893815ed64445fe7672dcdb6c8..1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -50,13 +50,18 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + data = TargetWrapperXPU::Malloc(size); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } return data; } -void TargetFree(TargetType target, void* data) { +void TargetFree(TargetType target, void* data, std::string free_flag) { switch (target) { case TargetType::kHost: case TargetType::kX86: @@ -71,7 +76,11 @@ void TargetFree(TargetType target, void* data) { #endif // LITE_WITH_CUDA #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: - TargetWrapperCL::Free(data); + if (free_flag == "cl_use_image2d_") { + TargetWrapperCL::FreeImage(data); + } else { + TargetWrapperCL::Free(data); + } break; #endif // LITE_WITH_OPENCL #ifdef 
LITE_WITH_FPGA @@ -89,6 +98,11 @@ void TargetFree(TargetType target, void* data) { TargetWrapper::Free(data); break; #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::Free(data); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown type"; } diff --git a/lite/core/memory.h b/lite/core/memory.h index 5a56f73b0de0fce64905f483ded88eda9ceffd52..a1013910019251271ddfccfbc700297c45226fe6 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" #include "lite/utils/logging.h" @@ -34,6 +35,10 @@ #include "lite/backends/mlu/target_wrapper.h" #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif // LITE_WITH_XPU + namespace paddle { namespace lite { @@ -43,7 +48,9 @@ LITE_API void* TargetMalloc(TargetType target, size_t size); // Free memory for a specific Target. All the targets should be an element in // the `switch` here. -void LITE_API TargetFree(TargetType target, void* data); +void LITE_API TargetFree(TargetType target, + void* data, + std::string free_flag = ""); // Copy a buffer from host to another target. void TargetCopy(TargetType target, void* dst, const void* src, size_t size); @@ -117,6 +124,9 @@ class Buffer { data_ = TargetMalloc(target, size); target_ = target; space_ = size; +#ifdef LITE_WITH_OPENCL + cl_use_image2d_ = false; +#endif } } @@ -128,15 +138,15 @@ class Buffer { const size_t img_w, const size_t img_h, void* host_ptr = nullptr) { - size_t size = sizeof(T) * img_w * img_h * - 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || - cl_image2d_height_ < img_h) { + cl_image2d_height_ < img_h || host_ptr != nullptr) { CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; - space_ = size; // un-used for opencl Image2D + space_ = sizeof(T) * img_w * img_h * + 4; // un-used for opencl Image2D, 4 for RGBA, + cl_use_image2d_ = true; cl_image2d_width_ = img_w; cl_image2d_height_ = img_h; } @@ -145,7 +155,11 @@ class Buffer { void Free() { if (space_ > 0 && own_data_) { - TargetFree(target_, data_); + if (!cl_use_image2d_) { + TargetFree(target_, data_); + } else { + TargetFree(target_, data_, "cl_use_image2d_"); + } } data_ = nullptr; target_ = TargetType::kHost; @@ -164,6 +178,7 @@ class Buffer { private: // memory it actually malloced. 
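Editorial note (illustrative, not part of the patch): the Buffer changes above keep allocation lazy: ResetLazy reallocates only when the requested size exceeds the currently owned space (or the target changes), and Free now routes OpenCL Image2D allocations to TargetWrapperCL::FreeImage via the new cl_use_image2d_ flag. A host-only sketch of the lazy-growth part, assuming a plain malloc/free allocator.

#include <cstdio>
#include <cstdlib>

// Minimal host-only analogue of Buffer::ResetLazy()/Free().
class LazyBuffer {
 public:
  void ResetLazy(size_t size) {
    if (size > space_) {          // grow only when the owned block is too small
      Free();
      data_ = std::malloc(size);
      space_ = size;
    }
  }
  void Free() {
    if (space_ > 0) {
      std::free(data_);
      data_ = nullptr;
      space_ = 0;
    }
  }
  size_t space() const { return space_; }
  ~LazyBuffer() { Free(); }

 private:
  void* data_{nullptr};
  size_t space_{0};
};

int main() {
  LazyBuffer buf;
  buf.ResetLazy(128);   // allocates
  buf.ResetLazy(64);    // no-op: the existing block is large enough
  buf.ResetLazy(256);   // reallocates
  std::printf("owned space: %zu bytes\n", buf.space());
  return 0;
}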
size_t space_{0}; + bool cl_use_image2d_{false}; // only used for OpenCL Image2D size_t cl_image2d_width_{0}; // only used for OpenCL Image2D size_t cl_image2d_height_{0}; // only used for OpenCL Image2D void* data_{nullptr}; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 8ee2a3fcd94a527b43836b69f731577dabee6ed3..d036bf7988b98e64586e42683d33b4696e9ff706 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,6 +21,8 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__multi_encoder_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc @@ -35,8 +37,8 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + multi_stream_analysis_pass.cc mlu_postprocess_pass.cc - subgraph_cast_display_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) @@ -71,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op) if (WITH_TESTING) list(APPEND pattern_deps gtest) endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) +lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) +lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher) # for mobile, unnecessary to compile the following testings. diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h index df70565c0775acdb61cb540598f15b7f84e0119c..a68890910ab33bd32c68efc6f06236db21909a05 100644 --- a/lite/core/mir/dot.h +++ b/lite/core/mir/dot.h @@ -27,8 +27,8 @@ #include "lite/utils/string.h" namespace paddle { -namespace inference { -namespace analysis { +namespace lite { +namespace mir { static size_t dot_node_counter{0}; @@ -162,6 +162,6 @@ class Dot { std::vector attrs_; }; -} // namespace analysis -} // namespace inference +} // namespace mir +} // namespace lite } // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index e65e72cf7b367ee8477f3f783ae4d82372529864..04a36976c7110c64ef781af12fc86fd4853fe583 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose DEPS pattern_matcher_high_api) lite_cc_library(fuse_interpolate SRCS interpolate_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..655274070f1ffcccf39b5f3ff6aaa705c5cbbfda --- /dev/null +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -0,0 +1,637 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUSingleEncoderFuser : public FuseBase { + public: + explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu") + : act_type_(act_type) {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + + auto* q_mul_y = + VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* q_mul = OpNode("q_mul", "mul"); + auto* q_mul_out = VarNode("q_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* q_add_y = VarNode("q_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate(); + auto* q_add_out = VarNode("q_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate(); + auto* q_reshape2_out = VarNode("q_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate(); + auto* q_transpose2_out = VarNode("q_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X") + ->AsIntermediate(); + auto* q_transpose2_xshape = + VarNode("q_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate(); + auto* q_scale_out = VarNode("q_scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + + auto* k_mul_y = + VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate(); + auto* k_mul_out = VarNode("k_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* k_add_y = VarNode("k_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate(); + auto* k_add_out = VarNode("k_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate(); + auto* k_reshape2_out = VarNode("k_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* 
k_reshape2_xshape = VarNode("k_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); + auto* k_transpose2_out = VarNode("k_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* k_transpose2_xshape = + VarNode("k_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul_out = VarNode("qk_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qk_mask = VarNode("qk_mask") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate(); + auto* qk_add_out = VarNode("qk_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X") + ->AsIntermediate(); + auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate(); + auto* qk_softmax_out = VarNode("qk_softmax_out") + ->assert_is_op_output("softmax", "Out") + ->AsIntermediate(); + auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate(); + auto* qk_dropout_out = VarNode("qk_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* qk_dropout_mask = VarNode("qk_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* v_mul_y = + VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate(); + auto* v_mul_out = VarNode("v_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* v_add_y = VarNode("v_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate(); + auto* v_add_out = VarNode("v_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate(); + auto* v_reshape2_out = VarNode("v_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* v_reshape2_xshape = VarNode("v_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate(); + auto* v_transpose2_out = VarNode("v_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* v_transpose2_xshape = + VarNode("v_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate(); + auto* qkv_matmul_out = VarNode("qkv_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* qkv_transpose2 = + OpNode("qkv_transpose2", "transpose2")->AsIntermediate(); + auto* qkv_transpose2_out = VarNode("qkv_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* qkv_transpose2_xshape = + VarNode("qkv_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* qkv_reshape2 = 
OpNode("qkv_reshape2", "reshape2")->AsIntermediate(); + auto* qkv_reshape2_out = VarNode("qkv_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* qkv_mul_y = + VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate(); + auto* qkv_mul_out = VarNode("qkv_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_y = VarNode("qkv_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate(); + auto* qkv_add_out = VarNode("qkv_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate(); + auto* qkv_dropout_out = VarNode("qkv_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_mask = VarNode("qkv_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate(); + auto* qkv_add_2_out = VarNode("qkv_add_2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate(); + auto* qkv_ln_2_out = VarNode("qkv_ln_2_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_2_var = VarNode("qkv_ln_2_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + auto* qkv_mul_3_y = + VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate(); + auto* qkv_mul_3_out = VarNode("qkv_mul_3_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_3_y = VarNode("qkv_add_3_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate(); + auto* qkv_add_3_out = VarNode("qkv_add_3_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate(); + auto* qkv_act_out = VarNode("qkv_act_out") + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_mul_4_y = + VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate(); + auto* qkv_mul_4_out = VarNode("qkv_mul_4_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_4_y = VarNode("qkv_add_4_y") + 
->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate(); + auto* qkv_add_4_out = VarNode("qkv_add_4_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate(); + auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate(); + auto* qkv_add_5_out = VarNode("qkv_add_5_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate(); + auto* qkv_ln_5_out = VarNode("qkv_ln_5_out") + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_5_var = VarNode("qkv_ln_5_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + // TODO(miaotianxiang): use LinksFrom/LinksTo() instead + *input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >> + *q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >> + *q_scale_out >> *qk_matmul; + *q_mul_y >> *q_mul; + *q_add_y >> *q_add; + *q_reshape2 >> *q_reshape2_xshape; + *q_transpose2 >> *q_transpose2_xshape; + + *input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >> + *k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul; + *k_mul_y >> *k_mul; + *k_add_y >> *k_add; + *k_reshape2 >> *k_reshape2_xshape; + *k_transpose2 >> *k_transpose2_xshape; + + *qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >> + *qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul; + *qk_mask >> *qk_add; + *qk_dropout >> *qk_dropout_mask; + + *input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >> + *v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul; + *v_mul_y >> *v_mul; + *v_add_y >> *v_add; + *v_reshape2 >> *v_reshape2_xshape; + *v_transpose2 >> *v_transpose2_xshape; + + *qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >> + *qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >> + *qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >> + *qkv_add_2; + *qkv_transpose2 >> *qkv_transpose2_xshape; + *qkv_reshape2 >> *qkv_reshape2_xshape; + *qkv_mul_y >> *qkv_mul; + *qkv_add_y >> *qkv_add; + *qkv_dropout >> *qkv_dropout_mask; + + *input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out; + *qkv_ln_2_scale >> *qkv_ln_2; + *qkv_ln_2_bias >> *qkv_ln_2; + *qkv_ln_2 >> *qkv_ln_2_mean; + *qkv_ln_2 >> *qkv_ln_2_var; + + *qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >> + *qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >> + *qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >> + *qkv_dropout_4_out >> *qkv_add_5; + *qkv_mul_3_y >> *qkv_mul_3; + *qkv_add_3_y >> *qkv_add_3; + 
*qkv_mul_4_y >> *qkv_mul_4; + *qkv_add_4_y >> *qkv_add_4; + *qkv_dropout_4 >> *qkv_dropout_4_mask; + + *qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out; + *qkv_ln_5_scale >> *qkv_ln_5; + *qkv_ln_5_bias >> *qkv_ln_5; + *qkv_ln_5 >> *qkv_ln_5_mean; + *qkv_ln_5 >> *qkv_ln_5_var; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("single_encoder"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name}); + op_desc.SetInput("FCWeight", + { + matched.at("q_mul_y")->arg()->name, + matched.at("k_mul_y")->arg()->name, + matched.at("v_mul_y")->arg()->name, + matched.at("qkv_mul_y")->arg()->name, + matched.at("qkv_mul_3_y")->arg()->name, + matched.at("qkv_mul_4_y")->arg()->name, + }); + op_desc.SetInput("FCBias", + { + matched.at("q_add_y")->arg()->name, + matched.at("k_add_y")->arg()->name, + matched.at("v_add_y")->arg()->name, + matched.at("qkv_add_y")->arg()->name, + matched.at("qkv_add_3_y")->arg()->name, + matched.at("qkv_add_4_y")->arg()->name, + }); + op_desc.SetInput("LNScale", + { + matched.at("qkv_ln_2_scale")->arg()->name, + matched.at("qkv_ln_5_scale")->arg()->name, + }); + op_desc.SetInput("LNBias", + { + matched.at("qkv_ln_2_bias")->arg()->name, + matched.at("qkv_ln_5_bias")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); + auto reshape_dim = reshape_op_info->GetAttr>("shape"); + op_desc.SetAttr("head_num", reshape_dim[2]); + op_desc.SetAttr("size_per_head", reshape_dim[3]); + op_desc.SetAttr("act_type", act_type_); + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto* single_encoder_stmt = matched.at("q_mul")->stmt(); + fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); + single_encoder_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "qk_mask", + "k_mul_y", + "v_mul_y", + "qkv_mul_y", + "qkv_mul_3_y", + "qkv_mul_4_y", + "q_add_y", + "k_add_y", + "v_add_y", + "qkv_add_y", + "qkv_add_3_y", + "qkv_add_4_y", + "qkv_ln_2_scale", + "qkv_ln_2_bias", + "qkv_ln_5_scale", + "qkv_ln_5_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul")); + } + IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out")); + } + + private: + std::string act_type_; +}; + +class XPUMultiEncoderFuser { + public: + bool IsDirectPredecessorOf(Node* op1, Node* op2) { + for (auto* out : op1->outlinks) { + for (auto* in : op2->inlinks) { + if (out == in) return true; + } + } + return false; + } + + void operator()(SSAGraph* graph) { + std::vector all_encoders; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "single_encoder") { + all_encoders.push_back(node); + } + } + VLOG(3) << "Found " << all_encoders.size() << " single_encoder"; + if (all_encoders.size() == 0) { + return; + } + + // TODO(miaotianxiang): more verification + for (size_t i = 0; i < all_encoders.size() - 1; ++i) { + CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1])); + } + std::string mask_name; + for (auto* encoder : all_encoders) { + auto* op_info = encoder->stmt()->op_info(); + if (mask_name.empty()) { + mask_name = op_info->Input("Mask").front(); + } else { + // CHECK(mask_name == op_info->Input("Mask").front()); + } + } + + std::unordered_set to_remove; + Node* first_encoder = all_encoders[0]; + std::string in_name, out_name; + std::vector arg_names{ + "FCWeight", "FCBias", "LNScale", "LNBias"}; + std::unordered_map> arg_map; + for (size_t i = 0; i < all_encoders.size(); ++i) { + Node* cur_encoder = all_encoders[i]; + auto* op_info = cur_encoder->stmt()->op_info(); + for (auto arg_name : arg_names) { + auto real_names = op_info->Input(arg_name); + for (auto name : real_names) { + auto* arg_node = graph->RetrieveArgument(name); + DirectedLink(arg_node, first_encoder); + arg_map[arg_name].push_back(name); + } + } + + auto* cur_out = + graph->RetrieveArgument(op_info->Output("Outputs").front()); + if (i == 0) { + // first encoder + to_remove.insert(cur_out); + in_name = op_info->Input("Inputs").front(); + mask_name = op_info->Input("Mask").front(); + } else if (i == all_encoders.size() - 1) { + // last encoder + to_remove.insert(cur_encoder); + DirectedLink(first_encoder, cur_out); + out_name = op_info->Output("Outputs").front(); + } else { + to_remove.insert(cur_encoder); + to_remove.insert(cur_out); + } + } + GraphSafeRemoveNodes(graph, to_remove); + + auto* multi_encoder_stmt = first_encoder->stmt(); + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__multi_encoder"); + op_desc.SetInput("Input", {in_name}); + for (auto kv : arg_map) { + op_desc.SetInput(kv.first, kv.second); + } + op_desc.SetInput("Mask", {mask_name}); + op_desc.SetOutput("Output", {out_name}); + op_desc.SetAttr("xpu", 1); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + op_desc.SetAttr("head_num", + first_encoder_op_info->GetAttr("head_num")); + 
op_desc.SetAttr("size_per_head", + first_encoder_op_info->GetAttr("size_per_head")); + op_desc.SetAttr("n_layers", all_encoders.size()); + op_desc.SetAttr( + "act_type", first_encoder_op_info->GetAttr("act_type")); + + auto* scope = multi_encoder_stmt->op()->scope(); + std::vector fc_weight_max(arg_map["FCWeight"].size()); + auto& fc_weight_names = arg_map["FCWeight"]; + for (size_t i = 0; i < fc_weight_names.size(); ++i) { + auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + fc_weight_max[i] = max_f; + } + + std::string max_name = "encoder_max"; + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, first_encoder); + auto* max_filter_tensor = scope->NewTensor(max_name); + max_filter_tensor->Resize({static_cast(fc_weight_max.size())}); + memcpy(max_filter_tensor->mutable_data(), + &fc_weight_max[0], + sizeof(float) * fc_weight_max.size()); + op_desc.SetInput("FCWeightMax", {max_name}); + + auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type()); + multi_encoder_op->Attach(op_desc, scope); + multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places()); + auto kernels = + multi_encoder_op->CreateKernels(multi_encoder_op->valid_places()); + multi_encoder_stmt->SetOp(multi_encoder_op); + multi_encoder_stmt->SetKernels(std::move(kernels)); + + // temp remove useless cast + std::unordered_set to_remove2; + Node* stack = nullptr; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "stack") { + stack = node; + } + } + Node* stack_out = stack->outlinks.front(); + for (Node* cast : stack_out->outlinks) { + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // remove + to_remove2.insert(cast_out); + to_remove2.insert(cast); + } + } + GraphSafeRemoveNodes(graph, to_remove2); + } +}; + +} // namespace fusion + +class XPUMultiEncoderFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + // TODO(miaotianxiang): backup graph, recover from failed match + std::vector act_types{"gelu", "relu"}; + for (auto& act_type : act_types) { + fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type); + single_encoder_fuser(graph.get()); + fusion::XPUMultiEncoderFuser multi_encoder_fuser; + multi_encoder_fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass, + paddle::lite::mir::XPUMultiEncoderFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("matmul"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc 
new file mode 100644 index 0000000000000000000000000000000000000000..de2210a76ea0647cb02131a088ceb754afd0ef9c --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -0,0 +1,951 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetBlock0Fuser : public FuseBase { + public: + XPUResNetBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + 
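    // Reading aid (not from the patch): in this pattern DSL, `*a >> *b` adds a
    // directed pattern edge a -> b, roughly the chained form of the explicit
    // b->LinksFrom({a}) calls used in the quant/dequant fuser changes below;
    // AsInput()/AsOutput() keep a matched variable node alive after fusion,
    // while AsIntermediate() marks it for removal once the whole pattern is
    // replaced by the fused op.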
auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + 
->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + 
matched.at("left_bn3_scale")->arg()->name, + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetBlock1Fuser : public FuseBase { + public: + XPUResNetBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* 
right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + 
VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNet50Fuser : public xpu::XPUFuseBase { + public: + XPUResNet50Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", 
"resnet_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate(); + auto* bottom_pool_out = VarNode("bottom_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> 
*resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet50"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + op_desc.SetInput("Filter", filter_name); + op_desc.SetInput("Bias", bias_name); + op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + + auto* resnet50_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet50_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = 
filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("MaxFilter", max_filter_name); + + auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet50_op->Attach(op_desc, scope); + resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places()); + auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places()); + resnet50_stmt->SetOp(resnet50_op); + resnet50_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out")); + } +}; + +} // namespace fusion + +class XPUResNet50FusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNet50Fuser resnet50_fuser; + resnet50_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_fuse_pass, + paddle::lite::mir::XPUResNet50FusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index f5a7837b53650e08f9632b499a4c2ab1faeaeedf..4393832931c95ca20e34ca3b3d2fb4501274b15f 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -26,7 +26,8 @@ namespace mir { void ConvBNFusePass::Apply(const std::unique_ptr& graph) { // initialze fuser params std::vector conv_has_bias_cases{true, false}; - std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + std::vector conv_type_cases{ + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; // start fuse using params for (auto conv_has_bias : conv_has_bias_cases) { for (auto conv_type : conv_type_cases) { diff --git 
a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 0f5bb64e10dd61c3edf4ddd32569a2d365651cdf..43869beddd0af701d5f78ea047b30f6b136e6b3f 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,14 +103,20 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); - CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[0])) - << "The BN bias's size should be equal to the size of the first " - << "dim size of the conv weights"; + if (conv_type_ == "conv2d_transpose") { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[1])) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } else { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[0])) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -153,12 +159,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr>("weight_scale"); - for (unsigned int i = 0; i < h; ++i) { - weight_scale[i] *= fabsf(alpha_data[i]); - if (alpha_data[i] < 0.f) { - auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { - ptr_row[j] *= -1; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= -1; + } + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + i * w; + for (unsigned int j = 0; j < w; ++j) { + ptr_row[j] *= -1; + } } } } @@ -176,9 +199,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels - conv_weight_d[i * w + j] *= alpha_data[i]; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= alpha_data[i]; + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { // n: conv2d output 
channels + for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels + conv_weight_d[i * w + j] *= alpha_data[i]; + } } } } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index ab81f3d809507dd340056c97a39998c908a75dc7..80a033c75f2e23efa091375ee2a9f78e3ff40d71 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { fuser(graph.get()); } - // delete quant_dequant_node - for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) { - fusion::DeleteQuantDequantOpFuser fuser(op_type); - fuser(graph.get()); - } + // process quant_dequant_node + fusion::DeleteQuantDequantOpFuser dqd_fuser; + dqd_fuser(graph.get()); } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 7797864a2e4b75f52fd7da93ea81613a2175f423..a3a98b871fb4b6f8230299cda978b0f1f8faa779 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, auto* output_scale_node = matched.at("output_scale_node"); auto* output_act_node = matched.at("output_act_node"); - // obtain values, save values and relink node + // obtain scale, save attrs and relink node int bit_length = quant_node->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); auto* scope = quant_node->stmt()->op()->scope(); @@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, ->GetMutable(); float scale_value = scale_tensor->data()[0] / range; + auto in_act_name = input_act_node->arg()->name; + auto out_act_name = output_act_node->arg()->name; auto outlinks = output_act_node->outlinks; for (auto* quantized_node : outlinks) { - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); + // save input scale in quantized op by input argname + index + auto op_desc = *quantized_node->stmt()->mutable_op_info(); + std::string argname; + int index; + op_desc.GetInputArgname(out_act_name, &argname); + op_desc.GetInputIndex(out_act_name, &index); + op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetAttr("bit_length", bit_length); + op_desc.UpdateAllInputs(out_act_name, in_act_name); + quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); IR_NODE_LINK_TO(input_act_node, quantized_node) } @@ -125,19 +136,18 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); int bit_length = quantized_op->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); float max_range = dequant_op->stmt()->op_info()->GetAttr("max_range"); float whole_weight_scale = static_cast(range * range) / max_range / range; - // max_range = range * range / max(abs(weight)) - // weight_scale = range * range / (range * range / max(abs(weight))) / 
range - // = max(abs(weight)) / range + // As: max_range = range * range / max(abs(weight)) + // So: whole_weight_scale + // = range * range / (range * range / max(abs(weight))) / range + // = max(abs(weight)) / range // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); @@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // be Cout. weight_scale_size = quantized_weight_t->dims()[0]; - } else if (quantized_op_type_ == "mul") { + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); // Fc weight: Cin * Cout, the weight_scale_size should be Cout. @@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. @@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() { ->assert_is_op_output(quantized_op_type_) ->assert_is_op_input(dequant_op_type, "X") ->AsIntermediate(); + // The scale var_node of input activation is deleted in DeleteQuantOpFuser auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale") ->assert_is_op_input(dequant_op_type) ->AsIntermediate(); @@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); std::vector weight_scale; std::vector quant_bits = @@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); - op_desc.SetInput("Input", {quantized_op_input->arg()->name}); - op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); - + if (quantized_op_type_ == "conv2d" || + quantized_op_type_ == "depthwise_conv2d") { + op_desc.SetInput("Input", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { + op_desc.SetInput("X", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); + } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. 
@@ -297,167 +309,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = VarNode("input_act_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = - OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_node = - VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_node = - VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); - output_scale_node->LinksFrom({quant_dequant_node}); - output_act_node->LinksFrom({quant_dequant_node}); - quantized_node->LinksFrom({output_act_node}); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = - VarNode("input_scale_left_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_left_node = - VarNode("input_act_left_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_left_node = - OpNode("quant_dequant_left_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_left_node = - VarNode("output_scale_left_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_left_node = - VarNode("output_act_left_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "X"); - quant_dequant_left_node->LinksFrom( - {input_scale_left_node, input_act_left_node}); - output_scale_left_node->LinksFrom({quant_dequant_left_node}); - output_act_left_node->LinksFrom({quant_dequant_left_node}); - - auto* input_scale_right_node = - VarNode("input_scale_right_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_right_node = - VarNode("input_act_right_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_right_node = - OpNode("quant_dequant_right_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_right_node = - VarNode("output_scale_right_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_right_node = - VarNode("output_act_right_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "Y"); - quant_dequant_right_node->LinksFrom( - {input_scale_right_node, input_act_right_node}); - output_scale_right_node->LinksFrom({quant_dequant_right_node}); - output_act_right_node->LinksFrom({quant_dequant_right_node}); - - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - quantized_node->LinksFrom({output_act_left_node, output_act_right_node}); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; - } - VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:" - << quantized_op_type_; + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type, "InScale"); + auto* 
input_act_node = + VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); + auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) + ->assert_is_op(quant_dequant_op_type); + auto* output_scale_node = + VarNode("output_scale_node") + ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + auto* output_act_node = + VarNode("output_act_node") + ->assert_is_op_output(quant_dequant_op_type, "Out"); + + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + output_scale_node->LinksFrom({quant_dequant_node}); + output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = matched.at("input_scale_node"); - auto* input_act_node = matched.at("input_act_node"); - auto* quant_dequant_node = matched.at("quant_dequant_node"); - auto* output_scale_node = matched.at("output_scale_node"); - auto* output_act_node = matched.at("output_act_node"); - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_node->stmt()->op()->scope(); - auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) - ->GetMutable(); - float scale_value = scale_tensor->data()[0] / range; - - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); - op_desc->SetInput("X", {input_act_node->arg()->name}); - IR_NODE_LINK_TO(input_act_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_node, - quant_dequant_node, - output_scale_node, - output_act_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = matched.at("input_scale_left_node"); - auto* input_act_left_node = matched.at("input_act_left_node"); - auto* quant_dequant_left_node = matched.at("quant_dequant_left_node"); - auto* output_scale_left_node = matched.at("output_scale_left_node"); - auto* output_act_left_node = matched.at("output_act_left_node"); - - auto* input_scale_right_node = matched.at("input_scale_right_node"); - auto* input_act_right_node = matched.at("input_act_right_node"); - auto* quant_dequant_right_node = matched.at("quant_dequant_right_node"); - auto* output_scale_right_node = matched.at("output_scale_right_node"); - auto* output_act_right_node = matched.at("output_act_right_node"); - - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_left_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_left_node->stmt()->op()->scope(); - auto* left_scale_tensor = - scope->FindVar(output_scale_left_node->arg()->name) - ->GetMutable(); - float left_scale_value = left_scale_tensor->data()[0] / range; - auto* right_scale_tensor = - scope->FindVar(output_scale_right_node->arg()->name) - ->GetMutable(); - float right_scale_value = right_scale_tensor->data()[0] / range; - - auto* op_desc = 
quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("x_input_scale", left_scale_value); - op_desc->SetAttr("y_input_scale", right_scale_value); - op_desc->SetInput("X", {input_act_left_node->arg()->name}); - op_desc->SetInput("Y", {input_act_right_node->arg()->name}); - IR_NODE_LINK_TO(input_act_left_node, quantized_node) - IR_NODE_LINK_TO(input_act_right_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_left_node, - quant_dequant_left_node, - output_scale_left_node, - output_act_left_node, - input_scale_right_node, - quant_dequant_right_node, - output_scale_right_node, - output_act_right_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; + auto* input_scale_node = matched.at("input_scale_node"); + auto* input_act_node = matched.at("input_act_node"); + auto* quant_dequant_node = matched.at("quant_dequant_node"); + auto* output_scale_node = matched.at("output_scale_node"); + auto* output_act_node = matched.at("output_act_node"); + auto input_act_name = input_act_node->arg()->name; + auto output_act_name = output_act_node->arg()->name; + + // Get scale value from scale var node + int bit_length = + quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); + int range = ((1 << (bit_length - 1)) - 1); + auto* scope = quant_dequant_node->stmt()->op()->scope(); + auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) + ->GetMutable(); + float scale_value = scale_tensor->data()[0] / range; + + auto quantized_nodes = output_act_node->outlinks; + for (auto* quantized_node : quantized_nodes) { + // Save quantization info in op_info attr + auto op_info = *quantized_node->stmt()->op_info(); + std::string argname; + int index; + op_info.GetInputArgname(output_act_name, &argname); + op_info.GetInputIndex(output_act_name, &index); + op_info.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_info.SetAttr("input_scale", scale_value); // Save it for now + op_info.SetAttr("bit_length", bit_length); + + op_info.UpdateAllInputs(output_act_name, input_act_name); + quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); + IR_NODE_LINK_TO(input_act_node, quantized_node); } + // delete nodes and edges + std::unordered_set nodes2rm = { + input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + GraphSafeRemoveNodes(graph, nodes2rm); } cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index bef9f4d9573d049700736c166cd0d31b668f7eff..ac3ac112b3aa504bc075125f2f13292073ca9444 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase { }; /* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * pooled/elementwise_add" can be deteted by this fuser. The fuser - * extract the input_scale form fake_quant_dequant_op and save into - * the quantized_op. Besides, the fuser delete fake_quant_dequant_op in - * the graph. + * quantized_op" can be deteted by this fuser. The fuser modifies the input + * scale for the quantized_op and deletes the fake_quant_dequant_op. 
*/ - class DeleteQuantDequantOpFuser : public FuseBase { public: - explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type) - : quantized_op_type_(quantized_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - - private: - std::string quantized_op_type_{}; }; } // namespace fusion diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 76c97d2da6ed9e7c6fc1f1889d80095278b68ec0..d7486c0933dbbe74115bd6358962817b2b946c12 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -14,6 +14,7 @@ #include "lite/core/mir/generate_program_pass.h" #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -25,10 +26,37 @@ namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "final program \n" << Visualize(graph.get()); - for (auto& item : graph->StmtTopologicalOrder()) { + std::vector nodes_in_order; +#ifdef LITE_WITH_CUDA + const std::string depend_pass = "multi_stream_analysis_pass"; + const std::string attr_name = "nodes_in_order"; + mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass); + if (pass->HasAttr(attr_name)) { + nodes_in_order = pass->GetAttr>(attr_name); + } +#endif + if (nodes_in_order.empty()) { + nodes_in_order = graph->StmtTopologicalOrder(); + } + + for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; +#ifdef LITE_WITH_CUDA + if (stmt.kernels().front()->target() == TargetType::kCUDA) { + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetNeedSync(stmt.need_sync_); + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetSyncStreams(stmt.sync_streams_); + } +#endif insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 28ec814fa85451b5292bfde6bddc6b64b57b2f08..55b7a004567ec5a5298e084839d6dcf5a8591882 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -26,15 +26,13 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - void GraphVisualizePass::Apply(const std::unique_ptr& graph) { VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { std::ostringstream os; - inference::analysis::Dot dot; + Dot dot; auto string_trunc = [](const std::string& str) -> std::string { const int max_disp_size = 100; if (str.length() > max_disp_size) @@ -87,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { if (!node->IsStmt()) continue; auto op_info = node->AsStmt().op_info(); auto op_type = op_info->Type(); - std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + std::string op_name; + if (node->AsStmt().need_sync_) { + std::ostringstream oss; + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + oss << std::to_string(node->AsStmt().sync_streams_[i]); + if (i != node->AsStmt().sync_streams_.size() - 1) { + oss << ","; + } + } + op_name = string_format("%s%d, stream=%d, sync_streams={%s}", + op_type.c_str(), + op_idx++, + node->AsStmt().stream_id_, + oss.str().c_str()); + } else { + op_name = string_format("%s%d", op_type.c_str(), op_idx++); + } // Add its input&output variables as the Dot nodes dot.AddNode(op_name, {Dot::Attr("shape", "box"), @@ -95,7 +109,13 
@@ std::string Visualize(mir::SSAGraph* graph) { Dot::Attr("color", "black"), Dot::Attr("fillcolor", "yellow")}); for (auto& x : node->inlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); @@ -103,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) { dot.AddEdge(var_name, op_name, {}); } for (auto& x : node->outlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 38293ede76ed35bf05767ce1333947b7dfdbc4ac..6c7a7c5803268f0729be3a1d2164c0598c8738bd 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) - .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index 515eab9d3f20ebf85c2c5abad6d84f109ec68068..03d3c5056031c0604e706157fd509508dcd5ea8d 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -562,20 +562,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { - // currently for non-persistent input and output args, mlu subgraph op - // only support float16/float32 data type - - // in two situations as folllows: - // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; - // arg_in and arg_out are assumed to be NHWC which user should be aware of. - // Thus here we change these args' layout to NHWC - if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { +// currently for non-persistent input and output args, mlu subgraph op +// only support float16/float32 data type + +// in two situations as folllows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of. +// Thus here we change these args' layout to NHWC +#ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::TargetWrapperMlu::UseFirstConv()) { + if (lite::DeviceInfo::Global().UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } +#endif // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { diff --git a/lite/core/mir/multi_stream_analysis_pass.cc b/lite/core/mir/multi_stream_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..46454a1fc357c7d96162a58a43a6c34bc890bc69 --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/multi_stream_analysis_pass.h" + +#include +#include +#include +#include + +#include "lite/core/device_info.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace mir { + +void MultiStreamAnalysisPass::CleanUp() { + exec_ops_.clear(); + wait_que_.clear(); + wait_que_cpu_.clear(); + std::queue empty_queue; + while (!exec_que_.empty()) { + exec_que_.pop(); + } + ops_in_streams_.clear(); + resources_.clear(); + map_arg_to_lane_.clear(); + op_types_set_.clear(); + io_copy_once_num_ = 0; +} + +void MultiStreamAnalysisPass::Init(SSAGraph* graph) { + // If not cleaned, the clone will overlay the previous state + CleanUp(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->IsStmt()) { + // Set all outputs of op to inaccessible state. + auto outputs = op_node->outlinks; + for (Node* node : outputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (!resources_.count(arg.name)) { + resources_[arg.name] = false; + } + } + // Set the weight input of op to be accessible. + auto inputs = op_node->inlinks; + for (Node* node : inputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (arg.is_weight || arg.is_persist) { + resources_[arg.name] = true; + } + } + + // feed and io_copy_once op has no dependencies and can be launched + // directly. Other ops are put into the waiting queue. + if (op_node->AsStmt().op_type() == "feed" || + op_node->AsStmt().op_type() == "io_copy_once") { + exec_que_.push(op_node); + } else { + auto tgt = op_node->AsStmt().kernels().front()->target(); + if (tgt == TargetType::kCUDA) { + wait_que_.push_back(op_node); + } else { + wait_que_cpu_.push_back(op_node); + } + } + op_types_set_.insert(op_node->AsStmt().op_type()); + } + } + + // Set the stream id according to the number of feed ops, and set the output + // of the feed op to be accessible. + int lane = 0; + auto nodes = graph->inputs(); + ops_in_streams_.resize(max_stream_); + + for (auto& node : nodes) { + std::string::size_type idx = node->AsArg().name.find("feed"); + if (idx != std::string::npos) { + for (auto& feed_ops : node->outlinks) { + if (feed_ops->AsStmt().op_type() == "feed") { + // feed op doesn't need to wait sync. 
+ feed_ops->AsStmt().need_sync_ = false; + CHECK_EQ(static_cast(feed_ops->outlinks.size()), 1) + << "feed op must have one output."; + for (auto& var : feed_ops->outlinks) { + var->AsArg().lane = lane; + map_arg_to_lane_[var->AsArg().name] = lane; + resources_[var->AsArg().name] = true; + } + feed_ops->AsStmt().stream_id_ = lane; + ops_in_streams_[lane].push_back(feed_ops); + ++lane; + if (lane >= max_stream_) { + lane = 0; + } + } + } + } + // set all io_copy_once op in the first stream + for (auto& io_copy_once_ops : node->outlinks) { + if (io_copy_once_ops->AsStmt().op_type() == "io_copy_once") { + ops_in_streams_[0].push_back(io_copy_once_ops); + io_copy_once_ops->AsStmt().stream_id_ = 0; + io_copy_once_ops->AsStmt().need_sync_ = false; + ++io_copy_once_num_; + } + } + } +} + +bool MultiStreamAnalysisPass::CheckOpSupport() { + std::unordered_set invalid_op = { + "while", "conditional_block", "conditional_block_infer", "graph_op"}; + for (auto& op_type : op_types_set_) { + if (invalid_op.count(op_type)) { + LOG(INFO) << "multi_stream_analysis_pass don't support " << op_type + << ", just return."; + return false; + } + } + return true; +} + +bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) { + // feed op are prepared when init. + std::string op_name = stmt_node->AsStmt().op_type(); + if (op_name == "feed") { + return true; + } + + // Check is op's input are all accessible. + std::vector args; + for (auto* ins : stmt_node->inlinks) { + args.push_back(ins->AsArg().name); + } + return CheckAccess(args); +} + +bool MultiStreamAnalysisPass::CheckAccess( + const std::vector& args) { + if (args.size() == 0) { + return true; + } + for (auto& name : args) { + if (resources_[name]) { + continue; + } else { + return false; + } + } + return true; +} + +int MultiStreamAnalysisPass::SelectStreamId(const std::vector& lanes) { + if (lanes.size() == 0) { + return 0; + } + + int res = lanes[0]; + int exclude_io_copy_once_num = ops_in_streams_[0].size() - io_copy_once_num_; + int min_num = lanes[0] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[0]].size(); + for (size_t i = 1; i < lanes.size(); ++i) { + int ith_num = lanes[i] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[i]].size(); + if (ith_num < min_num) { + res = lanes[i]; + min_num = ith_num; + } + } + + return res; +} + +void MultiStreamAnalysisPass::Launch(Node* stmt_node) { + // record ops launch order. + exec_que_.push(stmt_node); + std::vector lanes; + for (auto& in_arg : stmt_node->inlinks) { + // Weight parameter does not involve stream id, so just skip it. + if (in_arg->AsArg().is_weight || in_arg->AsArg().is_persist) { + continue; + } + + if (std::find(lanes.begin(), lanes.end(), in_arg->AsArg().lane) == + lanes.end()) { + lanes.push_back(in_arg->AsArg().lane); + } + } + + int stream_id = SelectStreamId(lanes); + + // If all inputs of the op are on multiple streams, they need to be + // synchronized + if (lanes.size() > 1) { + for (size_t i = 0; i < lanes.size(); ++i) { + if (lanes[i] != stream_id) { + stmt_node->AsStmt().sync_streams_.push_back(lanes[i]); + } + } + stmt_node->AsStmt().need_sync_ = true; + } + // io_copy are nodes inserted across devices and need to be synced. + if (stmt_node->AsStmt().op_type() == "io_copy") { + stmt_node->AsStmt().need_sync_ = true; + } + stmt_node->AsStmt().stream_id_ = stream_id; + + // set output lane and set the output of op to be accessible. 
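SelectStreamId above balances work across streams: among the lanes carrying the op's inputs it picks the one currently holding the fewest ops, with stream 0 counted without its io_copy_once ops. A small sketch of just that rule (standalone, illustrative names):

#include <iostream>
#include <vector>

// Among the candidate lanes (the lanes of the op's inputs), pick the one with
// the fewest ops; lane 0's load excludes io_copy_once ops.
int SelectLeastLoaded(const std::vector<int>& lanes,
                      const std::vector<int>& ops_per_stream,
                      int io_copy_once_num) {
  if (lanes.empty()) return 0;
  auto load = [&](int lane) {
    return lane == 0 ? ops_per_stream[0] - io_copy_once_num
                     : ops_per_stream[lane];
  };
  int best = lanes[0];
  for (size_t i = 1; i < lanes.size(); ++i) {
    if (load(lanes[i]) < load(best)) best = lanes[i];
  }
  return best;
}

int main() {
  // Stream 0 holds 7 ops but 3 are io_copy_once, so its effective load is 4.
  std::vector<int> ops_per_stream = {7, 4, 6};
  std::cout << SelectLeastLoaded({0, 2}, ops_per_stream, 3) << std::endl;  // 0
}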
+ for (auto& out_arg : stmt_node->outlinks) { + out_arg->AsArg().lane = stream_id; + resources_[out_arg->AsArg().name] = true; + } + ops_in_streams_[stream_id].push_back(stmt_node); +} + +void MultiStreamAnalysisPass::Apply(const std::unique_ptr& graph) { +#ifdef LITE_WITH_CUDA + typename Env::Devs& devs = + Env::Global(); + int dev_id = TargetWrapper::GetCurDevice(); + max_stream_ = devs[dev_id].max_stream(); +#else + LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON"; +#endif + + // Find the correct startup sequence for op. + Init(graph.get()); + bool is_valid = CheckOpSupport(); + if (!is_valid) { + return; + } + size_t prev_size; + + while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) { + prev_size = this->wait_que_.size() + this->wait_que_cpu_.size(); + // launch the acessible cuda kernel and remove it from wait que. + for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) { + if (IsPrepared(*it)) { + Launch(*it); + it = wait_que_.erase(it); + } else { + ++it; + } + } + // launch the accessible cpu kernel and remove it from wait que. + for (auto cpu_it = this->wait_que_cpu_.begin(); + cpu_it != this->wait_que_cpu_.end();) { + if (IsPrepared(*cpu_it)) { + Launch(*cpu_it); + cpu_it = wait_que_cpu_.erase(cpu_it); + } else { + ++cpu_it; + } + } + + if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) { + LOG(FATAL) << "network topo error!"; + } + } + + // Get exec ops order. + while (!exec_que_.empty()) { + auto* node = exec_que_.front(); + exec_ops_.push_back(node); + VLOG(4) << node->AsStmt().op_type() + << " stream: " << node->AsStmt().stream_id_ + << ", sync: " << node->AsStmt().need_sync_; + if (node->AsStmt().need_sync_) { + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + VLOG(4) << " " << node->AsStmt().sync_streams_[i]; + } + } + exec_que_.pop(); + } + + // Set attribute parameters, for passing parameters between passes + const std::string attr_name{"nodes_in_order"}; + SetAttr>(attr_name, &exec_ops_); + + LOG(INFO) << "stream " << 0 << " has " + << ops_in_streams_[0].size() - io_copy_once_num_ + << " ops. (exclude io_copy_once)."; + for (size_t i = 1; i < ops_in_streams_.size(); ++i) { + LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size() + << " ops."; + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(multi_stream_analysis_pass, + paddle::lite::mir::MultiStreamAnalysisPass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/multi_stream_analysis_pass.h b/lite/core/mir/multi_stream_analysis_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..37a7feca3a1200ad7ff26ef8fc0317deee9d174e --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
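The Apply loop above repeatedly sweeps both wait queues, launching every op whose inputs have become accessible, and treats a sweep that makes no progress as a broken topology. Roughly, the control flow is the following standalone sketch (toy op ids, not the pass itself):

#include <functional>
#include <iostream>
#include <list>
#include <stdexcept>

// Sweep the wait queue, launching every prepared op; a sweep that launches
// nothing means some dependency can never be satisfied.
void Schedule(std::list<int> wait_queue,
              const std::function<bool(int)>& is_prepared,
              const std::function<void(int)>& launch) {
  while (!wait_queue.empty()) {
    size_t prev_size = wait_queue.size();
    for (auto it = wait_queue.begin(); it != wait_queue.end();) {
      if (is_prepared(*it)) {
        launch(*it);
        it = wait_queue.erase(it);
      } else {
        ++it;
      }
    }
    if (wait_queue.size() == prev_size) {
      throw std::runtime_error("network topo error!");
    }
  }
}

int main() {
  bool op0_done = false;  // op 1 depends on op 0
  Schedule({1, 0},
           [&](int op) { return op == 0 || op0_done; },
           [&](int op) {
             if (op == 0) op0_done = true;
             std::cout << "launch op " << op << std::endl;
           });
}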
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lite/core/kernel.h" +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * MultiStreamAnalysisPass will find the correct launch sequence for all ops. + * Ideally, the order should be multiple asynchronous ops and a small number of + * synchronous ops. + */ +class MultiStreamAnalysisPass : public StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + // Init resource list. Set all ops except feed to inaccessible state and set + // stream id according to the numer of inputs. + void Init(SSAGraph* graph); + + // Clean state information of all member variables. + void CleanUp(); + + // After launching, unlock the output resources of op. + void Launch(Node* stmt_node); + + // If all inputs of an op are accessible, the op is considered to be in the + // prepared state + bool IsPrepared(Node* stmt_node); + + // Determine if all inputs of op are accessible. + bool CheckAccess(const std::vector& args); + + // The logic of selecting a stream: + // 1. Make the number of ops on each stream as close as possible. + // 2. The selected stream must be one of the streams contained in the input + // arg + int SelectStreamId(const std::vector& lanes); + + // Check if the model's ops are all supported. If you encounter unsupported + // ops, exit + bool CheckOpSupport(); + + private: + std::list wait_que_; + std::list wait_que_cpu_; + std::queue exec_que_; + std::vector exec_ops_; + std::vector> ops_in_streams_; + std::unordered_map resources_; + std::unordered_map map_arg_to_lane_; + int max_stream_; + int io_copy_once_num_; + std::unordered_set op_types_set_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index 45b15812fadb0789edea3f89fb00b4612bdb010f..ae7b112d9157de3f53c409dfc89bf1273531e05f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -80,6 +80,12 @@ class Node { // Description. std::string desc; + + // for cuda multi stream + bool need_sync_{false}; + int stream_id_{0}; + // streams which need to be sync. exclude stream_id_ + std::vector sync_streams_{}; }; struct Arg { @@ -93,6 +99,7 @@ class Node { // if the need more than one tool operator(eg. io_copy layout calib), the // argument between them should be persist to make sure it's only run once bool is_persist{false}; + int lane{-1}; }; Arg& AsArg(const std::string& name, int id); diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h index 4e8c8be292bbd5e7f46664378634d4f1aeed2965..64f2db82c0b1b0b863c1aa61b3b2affea5f85d89 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -17,9 +17,11 @@ #include #include #include +#include #include "lite/core/mir/node.h" #include "lite/core/mir/ssa_graph.h" +#include "lite/utils/varient.h" namespace paddle { namespace lite { @@ -121,6 +123,27 @@ class Pass { virtual ~Pass() = default; + bool HasAttr(const std::string& attr_name) const { + return pass_attrs_.count(attr_name) > 0; + } + + // Set a pointer to the attribute. Specific pass itself takes ownership of the + // attribute. + template + void SetAttr(const std::string& attr_name, const AttrType* attr) { + VLOG(4) << "Setting the attribute " << attr_name << " for the pass " + << name_; + pass_attrs_[attr_name].set(*attr); + } + + // Get a reference to the attribute previously set. 
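The new Pass attribute interface (HasAttr/SetAttr above, GetAttr just below) gives passes a string-keyed side channel: multi_stream_analysis_pass publishes "nodes_in_order" and generate_program_pass consumes it. A minimal sketch of the same publish/look-up idea using std::any (C++17) rather than Lite's variant-based pass_attrs_ storage; the class and names here are illustrative only:

#include <any>
#include <iostream>
#include <map>
#include <string>
#include <vector>

class AttrHolder {
 public:
  bool HasAttr(const std::string& name) const { return attrs_.count(name) > 0; }

  template <typename T>
  void SetAttr(const std::string& name, const T& value) {
    attrs_[name] = value;  // the real Pass keeps a pointer-backed variant
  }

  template <typename T>
  const T& GetAttr(const std::string& name) const {
    return std::any_cast<const T&>(attrs_.at(name));
  }

 private:
  std::map<std::string, std::any> attrs_;
};

int main() {
  AttrHolder pass;  // stands in for multi_stream_analysis_pass
  pass.SetAttr<std::vector<std::string>>("nodes_in_order", {"feed", "conv2d", "fetch"});
  if (pass.HasAttr("nodes_in_order")) {  // the generate_program_pass side
    for (const auto& op : pass.GetAttr<std::vector<std::string>>("nodes_in_order"))
      std::cout << op << " ";
  }
  std::cout << std::endl;
}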
+ template + const AttrType& GetAttr(const std::string& attr_name) const { + CHECK(pass_attrs_.count(attr_name)) + << attr_name << " attr not register for pass " << name_; + return pass_attrs_.at(attr_name).get(); + } + private: const Kind kind_; std::string name_; @@ -128,6 +151,8 @@ class Pass { std::set bound_targets_; std::set excluded_targets_; std::unordered_map> bound_kernels_; + std::unordered_map>> + pass_attrs_; }; // Different kinds. diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h index 849f80aea2191b72ac423c7125a4e69cb6927be5..170de1cd31ffd31662eb98898ad795993a36289e 100644 --- a/lite/core/mir/pass_registry.h +++ b/lite/core/mir/pass_registry.h @@ -59,6 +59,9 @@ class PassRegistry { } // namespace lite } // namespace paddle +// some platform-independent defintion +#include "lite/utils/macros.h" + #define REGISTER_MIR_PASS(name__, class__) \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ new class__); \ @@ -66,4 +69,4 @@ class PassRegistry { return mir_pass_registry##name__.Touch(); \ } \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ - __attribute__((unused)) = mir_pass_registry##name__ + UNUSED = mir_pass_registry##name__ diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index b625919cbfb6d26ecbbd1bad36772aff86bee087..aaebf852b2ec519515e59655a57600f59ec6a2c3 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector *subgraphs) { } std::string PMPattern::DotString() const { - using inference::analysis::Dot; Dot dot; int id = 0; // Create Nodes diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h index e62a4fc7494d750b2b5331c4b54b787df239ceee..3ac8e331aacb28044fca7f328319de37b27950bf 100644 --- a/lite/core/mir/pattern_matcher_high_api.h +++ b/lite/core/mir/pattern_matcher_high_api.h @@ -64,7 +64,6 @@ class FuseBase { protected: virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; - private: void PerformPatternMatcher(SSAGraph* graph); // Delete nodes that are marked as Intermediate diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 40cad8f6af75300ab85753b16e391daeeadc6c2f..37fff018caf4a6d90a48ad3f173ec28c09866690 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, paddle::lite::mir::QuantizedOpAttributesInferencePass) - .BindTargets({TARGET(kNPU)}); + .BindTargets({TARGET(kNPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 97c4819eaf6734ba9b374444166d17cb15e8ae65..5b6f968484b7b49838a004c3edfd00ff9b7e5e5e 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -24,11 +24,32 @@ class RuntimeContextAssignPass : public StmtPass { RuntimeContextAssignPass() {} void Apply(const std::unique_ptr& graph) override { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); - inst.picked_kernel().SetContext( - 
ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + +#ifdef LITE_WITH_OPENCL + if (inst.picked_kernel().target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx) + .As() + .CopySharedTo(&ctx->As()); + inst.picked_kernel().SetContext(std::move(ctx)); + } else { + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target())); + } +#else + int stream_id = inst.stream_id_; + + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), stream_id)); +#endif } } }; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index a2fe31cca72b02b1ac97dac37d51cebb5bb89128..54f5f4d46ce465d9db78b43f339296a3135c9507 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -251,9 +251,10 @@ std::vector SSAGraph::outputs() { } mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; + for (auto &node : node_storage_) { + if (node.IsArg() && node.arg()->name == arg) { + return &node; + } } return nullptr; } diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 9799cc72437c7581bde681ef2e80c0234635c2fe..b61f7f365f51a32e267dd12943be5fcfadb3e08a 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -30,10 +30,8 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - std::string SubgraphVisualizer::operator()() { - inference::analysis::Dot dot; + Dot dot; const std::vector subgraph_colors{ "red", "green", "cyan", "bisque3", "coral", "darkseagreen1", "goldenrod1", "darkorchid", @@ -314,14 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { std::vector> SubgraphDetector::ExtractSubgraphs( node_map_t *nodes) { - for (auto &n_tpo : graph_->NodeTopologicalOrder()) { + for (auto &ordered_node : graph_->NodeTopologicalOrder()) { // different orders when traversing nodes in graph may lead to // different subgraph division, which may generate different result // with device such as MLU. These different results are all "right" // but a little confusing. Thus the topological order is used instead // of the address of the node in graph. 
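SSAGraph::RetrieveArgument above now walks the graph's own node storage and returns the first argument node whose name matches (or nullptr), rather than consulting the arguments_ map; presumably this also finds argument nodes that were never registered in that side index. A standalone sketch of the lookup with toy Node/Arg types:

#include <iostream>
#include <list>
#include <string>

struct Arg { std::string name; };
struct Node {
  bool is_arg = false;
  Arg arg;
  bool IsArg() const { return is_arg; }
};

// Linear scan over node storage instead of a name-to-node map.
Node* RetrieveArgument(std::list<Node>& node_storage, const std::string& name) {
  for (auto& node : node_storage) {
    if (node.IsArg() && node.arg.name == name) return &node;
  }
  return nullptr;  // not found
}

int main() {
  std::list<Node> storage = {{true, {"conv2d_out"}}, {false, {}}};
  std::cout << (RetrieveArgument(storage, "conv2d_out") != nullptr) << std::endl;  // 1
}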
- CHECK(nodes->find(n_tpo) != nodes->end()); - node_dat_t *node = (*nodes)[n_tpo]; + CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 974772a9839c1e089359be3ae98e1833645ccd7a..1e54e1497b5d49754a705340aafa30ded1c2a727 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 45cf142f2f831fae11b6258b78dc24818c3a8988..5c5dc3204b8728e8b30661fae21b056db6960179 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -20,6 +20,7 @@ #include #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/subgraph/subgraph_detector.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { } void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { + if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" @@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); @@ -91,5 +107,7 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index f83448df42ffe6d6d8c5b37503b5127290037dce..b89c20f3bd4b7ca8e9650d20925f5b75dc26ec59 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class MLUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 
7117e1b3399fe823194f7f1a4d4c239099580955..a2369adc5d882310503cbf52fa5394098d824b40 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif auto tar_predictor = TestModel(FLAGS_model_dir, diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index ecccf89fa76287a3f30756f7138fcce229e8f337..121e64dc188eeb638becec3506b514bc24dad16d 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, auto input_or_output_scales = op_info->GetAttr>(attr_name); auto size = input_or_output_names.size(); CHECK(size == input_or_output_scales.size()); - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { if (input_or_output_names[i] == var_name) { *scale = input_or_output_scales[i]; return true; @@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { nodes.push_back(node); } + // record the copied node. + std::unordered_map cast_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &cast_nodes); } } } -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void PrecisionCastPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes) { // If this input is out of date. if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, in, graph, inst_node, + cast_nodes, graph->valid_places()); } } -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { +void PrecisionCastPass::AddCastInst( + const Type& from, + const Type& to, + Node* in, + SSAGraph* graph, + Node* inst_node, + std::unordered_map* cast_nodes, + const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst @@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, auto cast_op_output_name = in->AsArg().name + "/precision_trans"; // in->AsArg().name + "/precision_trans/" + // paddle::lite::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - cast_op_output_arg->AsArg().type = - LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); - auto* cast_inst = graph->NewInstructNode(); + if (cast_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + // Update the original instruction OpDesc. 
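The cast_nodes map threaded through ComplementInputs/AddCastInst lets several consumers of the same variable share one calib node: the first consumer creates it and records it, later consumers only relink to the cached node instead of inserting a duplicate cast. A toy sketch of that get-or-create-and-reuse step, with string stand-ins for graph nodes:

#include <iostream>
#include <string>
#include <unordered_map>

struct CastCache {
  std::unordered_map<std::string, std::string> cache;  // var -> cast output

  std::string GetOrCreate(const std::string& var) {
    auto it = cache.find(var);
    if (it != cache.end()) {
      std::cout << "reuse cast for " << var << std::endl;
      return it->second;
    }
    std::string out = var + "/precision_trans";
    std::cout << "insert cast " << var << " -> " << out << std::endl;
    cache[var] = out;
    return out;
  }
};

int main() {
  CastCache cache;
  cache.GetOrCreate("conv2d_out");  // first consumer: inserts the calib op
  cache.GetOrCreate("conv2d_out");  // second consumer: relinks to the same one
}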
+ // Update its input to the cast_op_output_name + // Add new link, newarg->inst + DirectedLink(cast_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } else { + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + cast_op_output_arg->AsArg().type = + LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); + auto* cast_inst = graph->NewInstructNode(); - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; + // create Op and kernels. + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string cast_type = in_persist ? "calib_once" : "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; + auto cast_op = LiteOpRegistry::Global().Create(cast_type); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + // Create the new var manually. + inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - float scale; - if (InferScale(in, inst_node, &scale)) { - op_desc.SetAttr("scale", scale); - } + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(cast_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); + } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->precision() == to.precision()) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); + (*cast_nodes)[in->AsArg().name] = cast_op_output_arg; + break; + } } - } - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << 
inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); + // Remove the old link + RemoveDirectedLink(in, inst_node); - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name + // Update the original instruction OpDesc. + // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); - // reset opdesc and update kernel information - UpdateInputs( - inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } // recreate the op auto original_selected_kernel = diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index b5f7c5d902a998e369f0b1775c59f50cbf8dc256..d8d6af5fcd06c187029c7c16a74efade0d4bd5ca 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes); void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* cast_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 75d8022d5f5f9d8572a5e020c11ae5d8cf630c10..aca7343c8af39f767c2a336e0b298995731b755f 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "picked, opencl found"; is_found = true; } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { + TargetCompatibleTo(*out_arg_ty, to)) { VLOG(4) << "picked"; is_found = true; } diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older model, + // save this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the + * list in the quantized ops. 
+*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. */ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/mir/xpu_pattern_matcher.cc b/lite/core/mir/xpu_pattern_matcher.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f268e7af8a55d22163d52c7f8824406f58bd17b --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
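For abs_max weight quantization, the pass above replicates the single recorded scale into a per-output-channel list, taking the channel count from dims[1] for mul and dims[0] for conv2d/depthwise_conv2d. A small standalone sketch of just that expansion:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// abs_max weight quantization stores one scale, but downstream kernels expect
// one per output channel, so the value is replicated.
std::vector<float> ExpandScale(const std::string& op_type,
                               const std::vector<int64_t>& weight_dims,
                               float single_scale) {
  int out_channels = static_cast<int>(
      op_type == "mul" ? weight_dims[1] : weight_dims[0]);
  return std::vector<float>(out_channels, single_scale);
}

int main() {
  auto scales = ExpandScale("mul", {768, 512}, 0.017f);
  std::cout << scales.size() << " scales, each " << scales[0] << std::endl;  // 512
}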
+ +#include +#include +#include +#include + +#include "lite/core/mir/dot.h" +#include "lite/core/mir/xpu_pattern_matcher.h" +#include "lite/core/op_lite.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUPatternMatcher::operator()(SSAGraph *graph, + XPUPatternMatcher::handle_t handler) { + if (!MarkPMNodesInGraph(graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detected " << subgraphs.size() << " subgraph"; + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { + VLOG(3) << "mark pmnodes in graph"; + if (graph->nodes().empty()) return false; + for (auto &node : graph->mutable_nodes()) { + for (const auto &pmnode : pattern_.nodes()) { + if (pmnode->Tell(&node)) { + pmnodes2nodes_[pmnode.get()].insert(&node); + } + } + } + // Check to early stop if some PMNode can't find matched Node. + for (auto &pmnode : pattern_.nodes()) { + if (!pmnodes2nodes_.count(pmnode.get())) { + VLOG(4) << pmnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pmnodes2nodes_.size() << " nodes marked"; + + return !pmnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void XPUPatternMatcher::ValidateByNodeRole( + std::vector *subgraphs) { + subgraphs->erase( + std::remove_if(subgraphs->begin(), + subgraphs->end(), + [](const XPUPatternMatcher::subgraph_t &subgraph) -> bool { + // Collect the inlinks and outlinks. + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + for (auto &item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto *x : item.second->outlinks) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); + + for (auto &subgraph : *subgraphs) { + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + extra_input_vars_.emplace_back(); + for (auto &item : subgraph) { + for (auto *x : item.second->inlinks) { + if (x->IsArg() && ios.count(x) == 0) { + // extra weight var + extra_input_vars_.back().push_back(x); + } + } + } + } +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node *node, PMNode *pat) { + if (nodes_.count(node)) { + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; + } + } + + void Register(Node *node, PMNode *pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outlinks) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector XPUPatternMatcher::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + auto *first_pnode = pattern_.edges().empty() ? 
pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pmnodes2nodes_.count(first_pnode)) return result; + for (auto *node : pmnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PMNode to subgraphs by deducing the connection relations defined + // in edges of PMNodes. + for (const auto &edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PMNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node *source : pmnodes2nodes_[edge.first]) { + for (Node *target : pmnodes2nodes_[edge.second]) { + // TODO(Superjomn) add some prune strategies. + for (const auto &group : pre_groups) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + } + + for (auto &group : bi_records[step % 2]) { + XPUPatternMatcher::subgraph_t subgraph; + for (auto &role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +struct GraphItemLessThan { + bool operator()(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } + } +}; + +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 +void XPUPatternMatcher::UniquePatterns( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + std::hash hasher; + for (auto &g : *subgraphs) { + // Sort the items in the sub-graph, and transform to a string key. 
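DetectPatterns above grows candidate matches edge by edge, and HitGroup::Match/Register keep each partial match consistent: a pattern role binds to at most one graph node, and a graph node fills at most one role. A toy version of that consistency check, using ints in place of node and role pointers:

#include <iostream>
#include <map>
#include <set>

struct HitGroup {
  std::map<int, int> roles;   // pattern role -> graph node
  std::set<int> used_nodes;   // graph nodes already bound to some role

  bool Match(int node, int role) const {
    if (used_nodes.count(node)) {
      auto it = roles.find(role);
      return it != roles.end() && it->second == node;
    }
    auto it = roles.find(role);
    return it == roles.end() || it->second == node;
  }

  void Register(int node, int role) {
    roles[role] = node;
    used_nodes.insert(node);
  }
};

int main() {
  HitGroup group;
  group.Register(/*node=*/10, /*role=*/0);
  std::cout << group.Match(10, 0) << " "        // 1: same binding is consistent
            << group.Match(10, 1) << " "        // 0: node 10 already plays role 0
            << group.Match(11, 0) << std::endl;  // 0: role 0 already bound to node 10
}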
+ std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); + STL::stringstream ss; + for (auto &item : sorted_keys) { + ss << reinterpret_cast(item.first) << ":" + << reinterpret_cast(item.second); + } + auto key = hasher(ss.str()); + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void XPUPatternMatcher::RemoveOverlappedMatch( + std::vector *subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto &subgraph : *subgraphs) { + bool valid = true; + for (auto &item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto &item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher.h b/lite/core/mir/xpu_pattern_matcher.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac03718f32a859ff6888e63e57fd4098e435e06 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/mir/pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +/* + * PatternMatcher helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PMNodes in a PMPattern, + * 2. Extend a PMNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a matcher + * PatternMatcher matcher; + * // Define the matcher's pattern, by adding PMNode and define the edges. + * auto* node0 = matcher.mutable_pattern().AddNode(...) + * auto* node1 = matcher.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * matcher.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * PatternMatcher::handle_t handler = some labmda + * // Execute the matcher. + * matcher(&graph, handler); + */ +struct XPUPatternMatcher { + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(SSAGraph* graph, handle_t handler); + + const PMPattern& pattern() const { return pattern_; } + PMPattern* mutable_pattern() { return &pattern_; } + + // Mark the nodes that fits the pattern. 
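RemoveOverlappedMatch above keeps the first match that claims any given intermediate node and drops later overlapping ones, since intermediates are deleted after fusion and cannot be shared. Sketched standalone with toy types:

#include <iostream>
#include <set>
#include <vector>

struct Item { bool intermediate; int node; };
using Match = std::vector<Item>;

// Walk matches in order; drop a match if any of its intermediate nodes was
// already claimed by an earlier kept match.
std::vector<Match> RemoveOverlapped(const std::vector<Match>& matches) {
  std::vector<Match> kept;
  std::set<int> claimed;
  for (const auto& m : matches) {
    bool valid = true;
    for (const auto& item : m) {
      if (item.intermediate && claimed.count(item.node)) { valid = false; break; }
    }
    if (!valid) continue;
    for (const auto& item : m) claimed.insert(item.node);
    kept.push_back(m);
  }
  return kept;
}

int main() {
  std::vector<Match> ms = {{{true, 1}, {false, 2}}, {{true, 1}, {false, 3}}};
  std::cout << RemoveOverlapped(ms).size() << std::endl;  // 1
}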
+ bool MarkPMNodesInGraph(SSAGraph* graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PMNodes will be removed, so can't shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + + using hit_rcd_t = + std::pair; + PMPattern pattern_; + std::unordered_map> pmnodes2nodes_; + std::vector> extra_input_vars_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.cc b/lite/core/mir/xpu_pattern_matcher_high_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ffc496d1593d15f02d82e824c06443e7b3e01c9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) { + VLOG(4) << "\n" << matcher_.pattern().DotString(); + // Get subgraphs and record the mir::Node pointers for each PMNode. + auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) { + // get all the reigistered nodes. 
+ key2nodes_.emplace_back(); + for (auto &item : nodes_) { + key2nodes_.back()[item.first] = subgraph.at(item.second); + } + }; + + matcher_(graph, handler); +} + +void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) { + std::set keys; + for (auto &node : nodes_) { + if (node.second->IsIntermediate()) { + keys.insert(node.first); + } + } + + VLOG(4) << "keys: " << key2nodes_.size(); + std::unordered_set nodes2rm; + for (auto &matched : key2nodes_) { + for (const auto &key : keys) { + nodes2rm.insert(matched.at(key)); + } + } + + VLOG(3) << "clean nodes " << nodes2rm.size(); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) { + auto it = nodes_.find(key); + if (it != nodes_.end()) { + return it->second; + } + nodes_.emplace(key, + matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key))); + it = nodes_.find(key); + return it->second; +} + +PMNode *XPUFuseBase::OpNode(const std::string &key, + const std::string &op_type) { + GetOrCreateNode(key)->set_op_type(op_type); + GetOrCreateNode(key)->AsOp(op_type); + return GetOrCreateNode(key); +} + +PMNode *XPUFuseBase::VarNode(const std::string &key) { + GetOrCreateNode(key)->AsVar(); + return GetOrCreateNode(key); +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.h b/lite/core/mir/xpu_pattern_matcher_high_api.h new file mode 100644 index 0000000000000000000000000000000000000000..3302bcb6137f16afcf82269af91c8a13558da2b9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/core/mir/xpu_pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +class XPUFuseBase { + public: + using key2nodes_t = std::map; + + virtual ~XPUFuseBase() = default; + + void operator()(SSAGraph* graph) { + BuildPattern(); + PerformPatternMatcher(graph); + + for (size_t i = 0; i < key2nodes_.size(); ++i) { + InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]); + } + + DeleteInterNodes(graph); + } + + // Build a PMPattern using PMNode. + virtual void BuildPattern() = 0; + + // Generate an operator desc with a matched subgraph. 
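GetOrCreateNode above is a keyed factory: each string key names exactly one PMNode, created on first use and returned unchanged afterwards, which is what lets OpNode/VarNode be called repeatedly with the same key. A minimal sketch of that pattern with a toy node type:

#include <iostream>
#include <map>
#include <memory>
#include <string>

struct PMNode { std::string name; };

class NodeRegistry {
 public:
  // Return the node registered under `key`, creating it on first use.
  PMNode* GetOrCreate(const std::string& key) {
    auto it = nodes_.find(key);
    if (it != nodes_.end()) return it->second.get();
    auto node = std::make_unique<PMNode>();
    node->name = key;
    PMNode* raw = node.get();
    nodes_.emplace(key, std::move(node));
    return raw;
  }

 private:
  std::map<std::string, std::unique_ptr<PMNode>> nodes_;
};

int main() {
  NodeRegistry registry;
  PMNode* a = registry.GetOrCreate("input");
  PMNode* b = registry.GetOrCreate("input");
  std::cout << std::boolalpha << (a == b) << std::endl;  // true
}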
+ virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } + + PMNode* OpNode(const std::string& key, const std::string& op_type); + + PMNode* VarNode(const std::string& key); + + protected: + virtual void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) = 0; + + void PerformPatternMatcher(SSAGraph* graph); + + // Delete nodes that are marked as Intermediate + void DeleteInterNodes(SSAGraph* graph); + + PMNode* GetOrCreateNode(const std::string& key); + + protected: + XPUPatternMatcher matcher_; + std::map nodes_; + std::vector key2nodes_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index c76e369466a9b998b2ad6fde67b97117649fddc0..f8a706179374a0c86e28cf9a3638f5df2c932540 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -22,6 +22,61 @@ namespace paddle { namespace lite { +bool OpLite::InferShape() { + // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ + // InferShapeByMemoryInternal will be applied. + if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + return this->InferShapeWithCache(); + } else { + // otherwise, InferShapeImpl is applied directly. + return this->InferShapeImpl(); + } +} +bool OpLite::InferShapeWithCache() { + // 1. Get vector of current input tensors + auto *current_inputs = param_.input_tensor_ptrs(); + // 2. Get hash value of current inputs shape and lod + size_t new_hash = 0; + for (auto iter = current_inputs->begin(); iter != current_inputs->end(); + iter++) { + // combined dims value into new_hash value. + auto &element_dims = (*iter)->dims(); + for (int i = 0; i < element_dims.size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(element_dims[i])); + } + // combine lod value into new_hash valud. + auto &emement_lods = (*iter)->lod(); + for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); + lod_iter++) { + for (int i = 0; i < lod_iter->size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); + } + } + } + // 3. infer shapes of output tensors + if (new_hash == io_shape_lod_hash_ && new_hash != 0) { + // if current hash value is consistent with io_shape_lod_hash_, + // previous outputs shape and lod are reused. + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + current_outputs->at(i)->Resize(last_output_shapes[i]); + current_outputs->at(i)->set_lod(last_output_lods[i]); + } + } else { + // otherwise, current hash value is changed, InferShapeImpl will apply. 
+ io_shape_lod_hash_ = new_hash; + this->InferShapeImpl(); + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + last_output_shapes[i] = current_outputs->at(i)->dims(); + last_output_lods[i] = current_outputs->at(i)->lod(); + } + } + return true; +} + std::vector> OpLite::CreateKernels( const std::vector &places, const std::string &kernel_type) { std::vector> kernels; @@ -102,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, return var->GetMutable(); } +void OpLite::AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var) { + bool is_have_input = + op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0; + CHECK(is_dispensable || is_have_input); + if (is_have_input) { + std::string input_var_name = op_desc.Input(input_name).front(); + *input_var = scope->FindVar(input_var_name)->GetMutable(); + } +} + +void OpLite::AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var) { + bool is_have_output = + op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0; + CHECK(is_dispensable || is_have_output); + if (is_have_output) { + std::string output_var_name = op_desc.Output(output_name).front(); + *output_var = scope->FindVar(output_var_name)->GetMutable(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 77d8091b4b16cfbce2efc3d549f916a9136c61ab..428b188c468ded790e74c9cc4f5da5c7efe2fd00 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,7 @@ #include "lite/core/kernel.h" #include "lite/core/scope.h" #include "lite/model_parser/cpp/op_desc.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -64,8 +66,8 @@ class OpLite : public Registry { // Check the shape. virtual bool CheckShape() const { return true; } // Inference the outputs' shape. - virtual bool InferShape() const { return true; } - virtual bool SmartInferShape() { return this->InferShape(); } + virtual bool InferShapeImpl() const { return true; } + virtual bool InferShape(); // Run this operator. 
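// A minimal standalone sketch of the shape/LoD caching idea used by
// OpLite::InferShapeWithCache above: every input dim and LoD offset is folded
// into a single hash, and InferShapeImpl is re-run only when that hash changes.
// HashCombine below is a hypothetical stand-in for lite::hash_combine and the
// containers are plain std::vector, so this is an illustration rather than the
// patched implementation.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static inline void HashCombine(size_t* seed, int64_t value) {
  // Boost-style hash mixing; lite::hash_combine is assumed to be similar.
  *seed ^= std::hash<int64_t>()(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

struct ShapeLodCache {
  size_t cached_hash{0};

  // Returns true when the current inputs hash to the cached value, meaning the
  // previously inferred output dims/LoDs can be reused instead of recomputed.
  bool Hit(const std::vector<std::vector<int64_t>>& input_dims,
           const std::vector<std::vector<uint64_t>>& input_lods) {
    size_t h = 0;
    for (const auto& dims : input_dims)
      for (int64_t d : dims) HashCombine(&h, d);
    for (const auto& lod : input_lods)
      for (uint64_t offset : lod) HashCombine(&h, static_cast<int64_t>(offset));
    const bool hit = (h == cached_hash) && (h != 0);
    cached_hash = h;
    return hit;
  }
};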
virtual bool Run(); // Indicate whether the Op runs only once or not @@ -103,6 +105,20 @@ class OpLite : public Registry { return kernel_.get(); } + // Attach input variable from scope by op_desc and input name + void AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var); + + // Attach output variable from scope by op_desc and output name + void AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var); + virtual ~OpLite() = default; protected: @@ -151,10 +167,16 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; - std::vector last_input_shapes; - std::vector last_output_shapes; - std::vector>> last_output_lods; - std::vector>> last_input_lods; + + std::vector last_output_shapes{}; + std::vector>> last_output_lods{}; + size_t io_shape_lod_hash_{}; + mutable operators::ParamBase param_; + + private: + // Infer Shape according to memory, if current input shapes are consistent + // with that of previous inputs, output shapes of last time will be reused. + bool InferShapeWithCache(); }; /* @@ -217,6 +239,32 @@ class OpInfo : public cpp::OpDesc { return false; } + // For the input variable name, find the index of the corresponding + // input argname + bool GetInputIndex(const std::string &value_name, int *out) const { + for (auto &item : inputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + + // For the output variable name, find the index of the corresponding + // output argname + bool GetOutputIndex(const std::string &value_name, int *out) const { + for (auto &item : outputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + void UpdateAllInputs(const std::string &from, const std::string &to) { for (auto &item : inputs_) { for (auto &var : item.second) { diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index fe1dff3c99c1d2413888e78c89c999caea0ab030..0c8d42f4e2dc0b0a32d352ed9b460e1a0b7bfb90 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -110,6 +110,9 @@ std::list> KernelRegistry::Create( case TARGET(kMLU): { CREATE_KERNEL(kMLU); } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -151,14 +154,30 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNHWC); INIT_FOR(kMLU, kInt16, kNCHW); - INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); INIT_FOR(kHost, kAny, kNHWC); INIT_FOR(kHost, kAny, kAny); + INIT_FOR(kHost, kBool, kNCHW); + INIT_FOR(kHost, kBool, kNHWC); + INIT_FOR(kHost, kBool, kAny); + INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kFloat, kNHWC); + INIT_FOR(kHost, kFloat, kAny); + INIT_FOR(kHost, kFP16, kNCHW); + INIT_FOR(kHost, kFP16, kNHWC); + INIT_FOR(kHost, kFP16, kAny); + INIT_FOR(kHost, kInt8, kNCHW); + INIT_FOR(kHost, kInt8, kNHWC); + INIT_FOR(kHost, kInt8, kAny); + INIT_FOR(kHost, kInt16, kNCHW); + INIT_FOR(kHost, kInt16, kNHWC); + 
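// A small self-contained illustration of the lookup that the new
// OpInfo::GetInputIndex / GetOutputIndex helpers above perform: given a map
// from argument name to the variable names bound to it, return the position of
// a variable inside its argument list. A plain std::map stands in for the
// OpInfo members here; names are illustrative only.
#include <algorithm>
#include <map>
#include <string>
#include <vector>

static bool FindVarIndex(
    const std::map<std::string, std::vector<std::string>>& args,
    const std::string& var_name,
    int* out) {
  for (const auto& item : args) {
    auto it = std::find(item.second.begin(), item.second.end(), var_name);
    if (it != item.second.end()) {
      *out = static_cast<int>(it - item.second.begin());
      return true;
    }
  }
  return false;
}
// Example: with args = {{"X", {"x0", "x1"}}}, FindVarIndex(args, "x1", &i)
// returns true and sets i to 1.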
INIT_FOR(kHost, kInt16, kAny); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt32, kNHWC); + INIT_FOR(kHost, kInt32, kAny); + INIT_FOR(kHost, kInt64, kNCHW); + INIT_FOR(kHost, kInt64, kNHWC); + INIT_FOR(kHost, kInt64, kAny); INIT_FOR(kX86, kFloat, kNCHW); INIT_FOR(kX86, kAny, kNCHW); @@ -216,6 +235,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 3c41c1fd8af240401c3edf0343433f8d8d9c85db..65279b74c5149f1c73cb42d57b5f47f608f38de1 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -135,6 +135,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -245,6 +251,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -429,32 +445,31 @@ class KernelRegistor : public lite::Registor { #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - OpKernelInfoCollector::Global().AddKernel2path( \ - #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ - __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) +#define REGISTER_LITE_KERNEL( \ + op_type__, target__, precision__, layout__, KernelClass, alias__) \ + static paddle::lite::KernelRegistor \ + LITE_KERNEL_REGISTER_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__)(#op_type__, \ + #alias__); \ + static KernelClass LITE_KERNEL_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__); \ + int touch_##op_type__##target__##precision__##layout__##alias__() { \ + OpKernelInfoCollector::Global().AddKernel2path( \ + #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ + __FILE__); \ + LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ + .Touch(); \ + return 0; \ + } \ + static bool LITE_KERNEL_PARAM_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__) UNUSED = \ + paddle::lite::ParamTypeRegistry::NewInstance( \ + #op_type__ "/" #alias__) #define LITE_KERNEL_INSTANCE( \ op_type__, target__, precision__, layout__, alias__) \ diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 4348f9eeaa592d263698eb164b29db8126a17698..2fb27996823cb7f9fdb842b668ca93da0941cdb1 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -78,6 +78,8 @@ class Optimizer { (defined LITE_WITH_ARM) 
"lite_elementwise_add_activation_fuse_pass", // #endif + "__xpu__resnet_fuse_pass", + "__xpu__multi_encoder_fuse_pass", "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and @@ -87,6 +89,7 @@ class Optimizer { "npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -128,7 +131,21 @@ class Optimizer { "memory_optimize_pass"}}; if (passes.size() == 1) { - passes_local.push_back(passes[0]); + // multi_stream_analysis_pass must be in the front of + // runtime_context_assign_pass + const std::string msa_pass{"multi_stream_analysis_pass"}; + const std::string depend_pass{"runtime_context_assign_pass"}; + if (passes[0] == msa_pass) { + auto iter = + std::find(passes_local.begin(), passes_local.end(), depend_pass); + if (iter != passes_local.end()) { + passes_local.insert(iter, msa_pass); + } else { + CHECK(false) << "Not find " << depend_pass; + } + } else { + passes_local.push_back(passes[0]); + } } RunPasses(passes_local); } else { diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index 39213a33cebd05d9cfa50d82cdfb09ad3f7ad637..ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -18,6 +18,7 @@ * of each kernel. */ #pragma once +#include #include #include #include "lite/core/program.h" @@ -177,6 +178,13 @@ class PrecisionProfiler { write_result_to_file&& write_tensorfile(in, name); return; } + case PRECISION(kInt64): { + auto ptr = in->data(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + return; + } default: *mean = -333333333333; *std_dev = -33333333333; diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68..3906cf0989a11c079323bdc8f256e6b5a5a33394 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -100,7 +100,8 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << "Avg (ms)" << " " << setw(12) << left << "Min (ms)" << " " << setw(12) << left << "Max (ms)" - << " " << setw(12) << left << "Last (ms)" << std::endl; + << " " << setw(12) << left << "Last (ms)" + << " " << setw(12) << left << "Percent (%)" << std::endl; // Profile information. 
if (concise) { std::map summary(op_comp); @@ -117,7 +118,16 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { summary.insert({unit.Character(), info}); } } + // compute total time + float total = 0.0; for (const auto& item : summary) { + total += item.second.avg; + } + for (const auto& item : summary) { + float percent = 0; + if (total > 0) { + percent = 100 * (item.second.avg / total); + } // clang-format off ss << setw(25) << left << fixed << item.first.op_type \ << " " << setw(40) << left << fixed << item.first.kernel_name \ @@ -125,12 +135,23 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << item.second.avg \ << " " << setw(12) << left << fixed << item.second.min \ << " " << setw(12) << left << fixed << item.second.max \ + << " " << setw(12) << left << fixed << percent << "%" \ << " " << std::endl; // clang-format on } } else { + float total = 0.0; for (auto& unit : units_) { const auto& times = unit.Timer(type)->LapTimes(); + total += times.Avg(w); + } + for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); + float run = times.Avg(w); + float percent = 0; + if (total > 0) { + percent = 100 * (run / total); + } // clang-format off ss << setw(25) << left << fixed << unit.Character().op_type \ << " " << setw(40) << left << fixed << unit.Character().kernel_name \ @@ -139,6 +160,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << times.Min(w) \ << " " << setw(12) << left << fixed << times.Max(w) \ << " " << setw(12) << left << fixed << times.Last(w) \ + << " " << setw(12) << left << fixed << percent << "%" \ << std::endl; // clang-format on } diff --git a/lite/core/program.cc b/lite/core/program.cc index 580389fbad54c0de8efd65ef78c9b69fd3e72893..1193e3c84f66b9d1dfb39d5dcc74265d212ab7ab 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -20,7 +20,7 @@ #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE #include "lite/core/profile/precision_profiler.h" #endif @@ -136,34 +136,35 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); std::string precision_profiler_summary = inst_precision_profiler.GetSummaryHeader(); -#endif #endif for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; +#endif +#ifdef LITE_WITH_CUDA + if (inst.need_sync()) { + inst.Sync(); + } #endif inst.Run(); -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA precision_profiler_summary += inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#endif #ifdef LITE_WITH_PRECISION_PROFILE LOG(INFO) << "\n" << precision_profiler_summary; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE +#endif } void Program::Build(const cpp::ProgramDesc& prog) { @@ -286,8 +287,7 @@ void Instruction::Run() { return; } - // op_->InferShape(); - op_->SmartInferShape(); + op_->InferShape(); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 
c845a17c52c0c565e339a13e093f3e8f59e8d4a7..9d5fef7c0367d0e0fabf6ecff8b22e5e20a7bb57 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -108,6 +108,18 @@ struct Instruction { bool is_feed_fetch_op() const { return is_feed_fetch_op_; } +#ifdef LITE_WITH_CUDA + bool need_sync() const { + if (kernel_->target() == TargetType::kCUDA) { + return kernel_->mutable_context()->As().need_sync(); + } else { + // the io_copy kernel has synced, so cpu kernels don't need sync.. + return false; + } + } + void Sync() const { kernel_->mutable_context()->As().Sync(); } +#endif + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; diff --git a/lite/core/types.cc b/lite/core/types.cc index 4ea383333d519ac2c481dce459ca49124a64df32..a19c5ed0a33986237ce03213875929d34a2fb363 100644 --- a/lite/core/types.cc +++ b/lite/core/types.cc @@ -67,31 +67,31 @@ STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { template <> Type StdTypeToRepr() { - return Type::_int32; + return Type::INT32; } template <> Type StdTypeToRepr() { - return Type::_int64; + return Type::INT64; } template <> Type StdTypeToRepr() { - return Type::_float32; + return Type::FLOAT32; } template <> Type StdTypeToRepr() { - return Type::_float64; + return Type::Float64; } template <> Type StdTypeToRepr>() { - return Type::_char_list; + return Type::CHARLIST; } template <> Type StdTypeToRepr() { - return Type::_string; + return Type::STRING; } template <> Type StdTypeToRepr() { - return Type::_bool; + return Type::BOOL; } } // namespace core diff --git a/lite/core/types.h b/lite/core/types.h index 8f154f9dd509d3627750ecbf301923a2296252d1..66dc44746a7496d9805e8cc2b6bf2df89b33ddbf 100644 --- a/lite/core/types.h +++ b/lite/core/types.h @@ -29,23 +29,23 @@ namespace core { */ // TODO(Superjomn) unify all the type representation across the lite framework. 
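// The enum renaming in types.cc above and types.h below changes only the
// spelling of the enumerators (Type::_int32 becomes Type::INT32, and so on);
// the StdTypeToRepr<T>() mapping behaves as before. A small usage sketch,
// assuming the usual specializations (int32_t -> INT32, std::string -> STRING)
// and the CHECK macro from lite/utils/cp_logging.h:
#include <cstdint>
#include <string>
#include "lite/core/types.h"
#include "lite/utils/cp_logging.h"

void CheckTypeRepr() {
  using paddle::lite::core::StdTypeToRepr;
  using paddle::lite::core::Type;
  CHECK(StdTypeToRepr<int32_t>() == Type::INT32);
  CHECK(StdTypeToRepr<std::string>() == Type::STRING);
  // Types without a specialization fall back to the primary template (UNK).
  CHECK(StdTypeToRepr<void*>() == Type::UNK);
}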
 enum class Type {
-  _unk = -1,
-  // primary types
-  _int32,
-  _int64,
-  _float32,
-  _float64,
-  _bool,
-  _string,
+  UNK = -1,
+  // primary types
+  INT32,
+  INT64,
+  FLOAT32,
+  Float64,
+  BOOL,
+  STRING,
   // primary list type
-  _char_list,
+  CHARLIST,
   // list types
-  _list,
+  LIST,
   // enum type
-  _enum,
-  _float16,
+  ENUM,
+  FLOAT16,
   // number of types
-  __num__,
+  NUM,
 };
 enum class FluidType {
@@ -81,7 +81,7 @@ enum class FluidType {
 template 
 Type StdTypeToRepr() {
-  return Type::_unk;
+  return Type::UNK;
 }
 template <>
 Type StdTypeToRepr();
@@ -92,6 +92,8 @@ Type StdTypeToRepr();
 template <>
 Type StdTypeToRepr();
 template <>
+Type StdTypeToRepr();
+template <>
 Type StdTypeToRepr>();
 template <>
 Type StdTypeToRepr();
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 0c9da1a76422edae45dfeec5d38556a5e2322a85..2a819883fa316bd1898c063912800b57804218db 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -18,6 +18,11 @@
 #include "paddle_api.h"          // NOLINT
 #include "paddle_use_passes.h"   // NOLINT
+#if defined(_WIN32)
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#endif
+
 using namespace paddle::lite_api;  // NOLINT
 DEFINE_string(model_dir, "", "Model dir path.");
diff --git a/lite/demo/cxx/train_demo/README.md b/lite/demo/cxx/train_demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..56f4513d45676a1deb51bfb93096db156ddd0449
--- /dev/null
+++ b/lite/demo/cxx/train_demo/README.md
@@ -0,0 +1,191 @@
+
+# Introduction
+PaddleLite is best known for inference on mobile devices, but it also supports training models on mobile. This document walks through a training example: the "Boston housing price prediction" task, also known as "fit-a-line".
+
+You can learn more about how this task is defined and modeled from the
+[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
+and
+[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
+in the book repository; the task is modeled with Linear Regression.
+This document focuses on how to port it to Paddle-Lite for training.
+
+Note: this tutorial trains the model through the C++ API; the other APIs do not support training yet.
+
+# Requirements
+
+- An Android phone to run the training program
+- Python with Paddle (version: 1.7.0) installed
+
+# Quick start
+
+## Step1 build paddle-lite
+
+Follow the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish paddle-lite lib. Taking a Linux build as an example, the commands are:
+
+```shell
+## set up the environment
+wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate
+tar xzf cmake-3.10.3-Linux-x86_64.tar.gz
+export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH
+
+wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
+unzip android-ndk-r17c-linux-x86_64.zip
+export NDK_ROOT=/opt/android-ndk-r17c
+
+## build
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv7 \
+  --build_extra=ON \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_train=ON full_publish
+```
+
+Output:
+
+```shell
+Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
+```
+
+## Step2 build lr_trainer
+
+```shell
+cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
+sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c
+```
+
+Output:
+```shell
+bin/
+`-- demo_trainer
+```
+
+## Step3 download model and run it!
+
+On your laptop, connect to the phone over USB, enable developer mode, and run the following from any directory:
+
+```shell
+local_path=/data/local/tmp/linear_regression
+adb shell "mkdir "${local_path}
+
+# download model and push to mobile
+wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
+tar -zxvf lite_lr_model.tar.gz
+adb push lite_lr_model/housing.data ${local_path}
+adb push lite_lr_model/model_dir ${local_path}
+
+# push lib and executable file to mobile
+adb push libpaddle_full_api_shared.so ${local_path}
+adb push demo_trainer ${local_path}
+adb shell chmod +x ${local_path}/demo_trainer
+
+# run it!
+adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true"
+```
+
+Expected output:
+
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+# More details
+The model above is downloaded as-is. If you want to generate it yourself, run:
+
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite/lite/demo/cxx/train_demo/
+python train.py --save_model
+```
+
+Output:
+
+```shell
+model_dir/
+|-- fc_0.b_0
+|-- fc_0.w_0
+|-- learning_rate_0
+`-- __model__
+
+md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
+```
+
+If you want to produce your own model for training, refer to how the model is saved in `train.py`.
+
+# Checking against Paddle training results
+
+## First 10 loss values
+
+To verify that paddle and lite behave consistently, we trained 10 batches with identical model parameters, identical data, and batch size = 1, and recorded the loss values of both.
+
+python + paddle command:
+
+```shell
+python train.py --num_steps=10 --batch_size=1
+```
+
+python + paddle result:
+
+```shell
+Train cost, Step 0, Cost 564.317017
+Train cost, Step 1, Cost 463.900238
+Train cost, Step 2, Cost 1197.537354
+Train cost, Step 3, Cost 1093.833008
+Train cost, Step 4, Cost 1282.760254
+Train cost, Step 5, Cost 792.097351
+Train cost, Step 6, Cost 491.775848
+Train cost, Step 7, Cost 698.496033
+Train cost, Step 8, Cost 248.444885
+Train cost, Step 9, Cost 325.135132
+```
+
+c++ with paddle-lite command:
+```
+./demo_trainer true
+```
+
+c++ with paddle-lite result:
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+## Loss curve
+
+With the training batch size fixed at 20 and the training data globally shuffled every epoch, the loss curves of paddle and lite after 100 epochs compare as follows.
+
+![lr_loss](image/lr_loss.png)
+
+To reproduce this, the paddle + python commands are:
+
+```
+git clone https://github.com/PaddlePaddle/book.git
+cd book/01.fit_a_line
+python train.py
+```
+
+The lite + c++ command is:
+```
+./demo_trainer false
+```
diff --git a/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b41808352a186e8ed434c0cf9364a9cae7d3928e
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 2.8)
+set (CMAKE_CXX_STANDARD 11)
+
+# Project's name
+
+if(NOT DEFINED LITE_ROOT)
+    message(FATAL_ERROR "please set LITE_ROOT with
+    -DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/")
+endif()
+
+project(demo_trainer)
+# Set the output folder where your program will be created
+set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+
+# The following folder will be included
+include_directories("include")
+include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include") + +add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc) + +TARGET_LINK_LIBRARIES(demo_trainer +"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so") diff --git a/lite/demo/cxx/train_demo/cplus_train/data_reader.cc b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..4546e2e5fecc17321e8126485022b4ac30876747 --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/data_reader.h" +#include + +using std::string; +using std::vector; + +int FEATURE_NUM = 13; +float rate = 0.8; + +int get_samples(string line, vector* feature, float* label) { + std::istringstream reader(line); + std::vector numbers; + do { + // read as many numbers as possible. + for (float number; reader >> number;) { + numbers.push_back(number); + } + // consume and discard token from stream. + if (reader.fail()) { + reader.clear(); + std::string token; + reader >> token; + } + } while (!reader.eof()); + + assert(numbers.size() == FEATURE_NUM + 1); + for (int i = 0; i < FEATURE_NUM; i++) { + feature->push_back(numbers[i]); + } + *label = numbers[FEATURE_NUM]; + return 0; +} + +int normalize(const vector>& origin_features, + vector>* features, + float rate) { + int inf = std::numeric_limits::max(); + vector min_vec(FEATURE_NUM, static_cast(inf)); + vector max_vec(FEATURE_NUM, -(static_cast(inf))); + vector sum_vec(FEATURE_NUM, 0); + vector avg_vec(FEATURE_NUM, 0); + + for (int i = 0; i < origin_features.size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + min_vec[j] = min(min_vec[j], origin_features[i][j]); + max_vec[j] = max(max_vec[j], origin_features[i][j]); + sum_vec[j] += origin_features[i][j]; + } + } + + for (int i = 0; i < FEATURE_NUM; i++) { + avg_vec[i] = sum_vec[i] / origin_features.size(); + } + + for (int i = 0; i < origin_features.size() * rate - 1; i++) { + vector feat; + for (int j = 0; j < FEATURE_NUM; j++) { + feat.push_back((origin_features[i][j] - avg_vec[j]) / + (max_vec[j] - min_vec[j])); + } + features->push_back(feat); + } +} + +int read_samples(const string fname, + vector>* features, + vector* labels) { + fstream fin; + fin.open(fname); + if (!static_cast(fin)) { + return 1; + } + vector> origin_features; + vector lines; + string line; + while (getline(fin, line)) { + lines.push_back(line); + } + fin.close(); + + for (int i = 0; i < lines.size(); i++) { + vector feat; + float lbl = 0; + get_samples(lines[i], &feat, &lbl); + origin_features.push_back(feat); + if (i < lines.size() * rate - 1) { + labels->push_back(lbl); + } + } + + cout << "finish read fata" << endl; + normalize(origin_features, features, rate); + assert(features->size() == labels->size()); + return 0; +} diff --git 
a/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..f035078fff35c4b2c0b41d0de84d2621c550d14e --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "include/data_reader.h" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +class LRModel { + public: + void InitModel() { + // 1. Set CxxConfig + CxxConfig config; + config.set_model_dir("model_dir"); + std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; + config.set_valid_places(valid_places); + predictor_ = CreatePaddlePredictor(config); + } + + float Predict(const vector>& features, + const vector& labels) { + // Create Tensor + assert(features.size() == labels.size()); + int batch_size = features.size(); + std::unique_ptr input_tensor(std::move(predictor_->GetInput(0))); + input_tensor->Resize(shape_t({batch_size, FEATURE_NUM})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + data[FEATURE_NUM * i + j] = features[i][j]; + } + } + std::unique_ptr y_tensor(std::move(predictor_->GetInput(1))); + y_tensor->Resize(shape_t({batch_size, 1})); + auto* y_data = y_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + y_data[i] = labels[i]; + } + predictor_->Run(); + std::unique_ptr output_tensor( + std::move(predictor_->GetOutput(0))); + return output_tensor->data()[0]; + } + + private: + std::shared_ptr predictor_; +}; + +int shuffle(vector>* features, vector* labels) { + assert(features->size() == labels->size()); + vector index; + for (int i = 0; i < features->size(); i++) { + index.push_back(i); + } + random_shuffle(index.begin(), index.end()); + + vector> tmp_features; + vector tmp_labels; + + for (int i = 0; i < features->size(); i++) { + tmp_features.push_back((*features)[index[i]]); + tmp_labels.push_back((*labels)[index[i]]); + } + + for (int i = 0; i < features->size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + (*features)[i][j] = tmp_features[i][j]; + } + (*labels)[i] = tmp_labels[i]; + } + return 0; +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + cerr << "usage: ./demo_trainer is_small" << endl; + cerr << " if is_small is true, the batch size is set to 1, " << endl; + cerr << " and it will only runs for 10 steps." 
<< endl; + return 1; + } + string is_small = argv[1]; + vector> features; + vector labels; + read_samples("housing.data", &features, &labels); + cout << "sample count: " << features.size() << " " << endl; + + std::shared_ptr local_model(new LRModel()); + local_model->InitModel(); + + if (is_small == "true") { + cout << "small mode" << endl; + for (int i; i < 10; i++) { + vector> batch_feature; + vector batch_label; + batch_feature.push_back(features[i]); + batch_label.push_back(labels[i]); + auto loss = local_model->Predict(batch_feature, batch_label); + cout << "sample " << i << ": " << loss << endl; + } + } else if (is_small == "false") { + // shuffle + cout << "full model" << endl; + int epoch = 100; + int batch_size = 20; + int step = 0; + for (int i; i < epoch; i++) { + shuffle(&features, &labels); + for (int j = 0; + j < ceil(static_cast(features.size()) / batch_size); + j++) { + int start_idx = j * batch_size; + int end_idx = + min((j + 1) * batch_size, static_cast(features.size())); + auto batch_feature = vector>(features.begin() + start_idx, + features.begin() + end_idx); + auto batch_label = + vector(labels.begin() + start_idx, labels.begin() + end_idx); + auto loss = local_model->Predict(batch_feature, batch_label); + if (step % 10 == 0) { + std::cout << "batch: " << i << ", step: " << step + << ", Loss: " << loss << endl; + } + step += 1; + } + } + } else { + cerr << "wrong arg for is_small: " << is_small << endl; + } +} diff --git a/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..050e929c9135ac939dac747e2e4a2490397a4c3d --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h @@ -0,0 +1,37 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +using std::string; +using std::vector; +using std::cerr; +using std::cout; +using std::endl; +using std::min; +using std::max; +using std::fstream; + +extern int FEATURE_NUM; + +int get_samples(string line, const vector& feature, float* label); +int read_samples(const string fname, + vector>* features, + vector* labels); diff --git a/lite/demo/cxx/train_demo/cplus_train/run_build.sh b/lite/demo/cxx/train_demo/cplus_train/run_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4fb444ebd1ecda40db2d69c24016cb78bacdc0ad --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/run_build.sh @@ -0,0 +1,21 @@ + +rm -rf build +mkdir build +cd build + +LITE_ROOT=$1 +NDK_ROOT=$2 + + +cmake .. \ + -DLITE_ROOT=${LITE_ROOT} \ + -DNDK_ROOT=${NDK_ROOT} \ + -DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \ + -DANDROID_TOOLCHAIN=gcc \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_PLATFORM=android-23 \ + -DANDROID=true \ + -DANDROID_STL=c++_static +make +cd .. 
+# ./bin/demo_trainer diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/lite/demo/cxx/train_demo/image/lr_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..626cb57ecd5d4cf50fd4d0b8aaadcc29146ca19b Binary files /dev/null and b/lite/demo/cxx/train_demo/image/lr_loss.png differ diff --git a/lite/demo/cxx/train_demo/train.py b/lite/demo/cxx/train_demo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..37825a5cc472990664f68cb38dbf7ee7859286b8 --- /dev/null +++ b/lite/demo/cxx/train_demo/train.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import argparse + +import math +import numpy + +import paddle +import paddle.fluid as fluid + + +def parse_args(): + parser = argparse.ArgumentParser("fit_a_line") + parser.add_argument( + '--save_model', + action='store_true', + help="Whether to save main program") + parser.add_argument( + '--num_steps', + type=int, + default=1000000000000, + help="train steps") + parser.add_argument( + '--num_epochs', type=int, default=100, help="number of epochs.") + parser.add_argument( + '--batch_size', type=int, default=20, help="batch size.") + parser.add_argument( + '--shuffle', + action='store_true', + help="Whether to shuffle train data.") + args = parser.parse_args() + return args + +# For training test cost +def train_test(executor, program, reader, feeder, fetch_list): + accumulated = 1 * [0] + count = 0 + for data_test in reader(): + outs = executor.run( + program=program, feed=feeder.feed(data_test), fetch_list=fetch_list) + accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)] + count += 1 + return [x_d / count for x_d in accumulated] + + +def main(): + if args.shuffle: + print("doing shuffle") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=args.batch_size) + else: + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=args.batch_size) + + # feature vector of length 13 + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + + main_program = fluid.default_main_program() + startup_program = fluid.default_startup_program() + + main_program.random_seed = 90 + startup_program.random_seed = 90 + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(cost) + + test_program = main_program.clone(for_test=True) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + num_epochs = args.num_epochs + + # main train loop. 
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(startup_program) + if args.save_model: + fluid.io.save_persistables(exe, "model_dir") + + # add feed and fetch op + feeded_var_names = ['x', 'y'] + fetch_var_names = ['mean_0.tmp_0'] + fluid.io.prepend_feed_ops(main_program, feeded_var_names) + fluid.io.append_fetch_ops(main_program, fetch_var_names) + with open("model_dir/__model__", "wb") as f: + f.write(main_program.desc.serialize_to_string()) + + with open("debug_main_program", "w") as f: + f.write(str(main_program)) + print("train model saved to model_dir") + return + + train_prompt = "Train cost" + step = 0 + for pass_id in range(num_epochs): + for data_train in train_reader(): + avg_loss_value, = exe.run( + main_program, + feed=feeder.feed(data_train), + fetch_list=[avg_loss]) + print("%s, Step %d, Cost %f" % + (train_prompt, step, avg_loss_value[0])) + if step == args.num_steps - 1: + return + step += 1 + + if math.isnan(float(avg_loss_value[0])): + sys.exit("got NaN loss, training failed.") + + +if __name__ == '__main__': + args = parse_args() + main() diff --git a/lite/demo/python/mobilenetv1_full_api.py b/lite/demo/python/mobilenetv1_full_api.py index a31469e3e8da81f3753dc5d241d4ef39ac03832f..c3a6bd077be5978f1ecaf9b040b119e50117d62b 100644 --- a/lite/demo/python/mobilenetv1_full_api.py +++ b/lite/demo/python/mobilenetv1_full_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/demo/python/mobilenetv1_light_api.py b/lite/demo/python/mobilenetv1_light_api.py index a44427092bae88aa41b3b1d0684cfcf36835b3d2..5847c7819366b654dd9d5b5cbe2108b54da7b04c 100644 --- a/lite/demo/python/mobilenetv1_light_api.py +++ b/lite/demo/python/mobilenetv1_light_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index d33a77c4bfcefbc349d453de05dcbb7c27707a19..9c96459993e55b441ea795c4f2cb58f40846c0d9 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "lite/fluid/data_type.h" #include #include diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h index 36386f7eb967f31ec258681fe17222a928aa7b4b..b1f2f14a0a4534e588d18237826858812740db69 100644 --- a/lite/fluid/lod.h +++ b/lite/fluid/lod.h @@ -19,7 +19,7 @@ namespace paddle { namespace lite { namespace fluid { -using LoD = std::vector>; +using LoD = std::vector>; static LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 78bb8d10b798b73861ddbf25e427289fc2984a55..b00e818c6cd21de717dab7b896a8f757b5b0011a 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 7550d770145d92ebd343f96a82c6f34d72c91ea5..83c85842f90900496e1a0ed4149a47234899d2f9 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -63,7 +63,6 @@ add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -92,7 +91,6 @@ add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_ add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc 
DEPS ${lite_kernel_deps} math_arm) @@ -106,13 +104,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math # 4. training kernels add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -if(LITE_WITH_TRAIN) - add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) -endif() + +add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d609716ee53ec584b8340e9b72498ed95afd5820..085e914c6e05c26d3031a4cfdac3c39d31f40f6d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -179,6 +179,44 @@ void SquareCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void HardSwishCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.hard_swish_threshold; + float scale = param.hard_swish_scale; + float offset = param.hard_swish_offset; + lite::arm::math::act_hard_swish(x_data, + output_data, + x_dims.production(), + threshold, + scale, + offset, + ctx.threads()); +} + +void ReciprocalCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_reciprocal( + x_data, output_data, x_dims.production(), ctx.threads()); +} + +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_abs( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -275,3 +313,26 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL(hard_swish, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::HardSwishCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL(reciprocal, + 
kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ReciprocalCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL( + abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 476d7bb0a32db193d9afb1451507699d0af71736..2e9774637b7a9156197ffeff5f4bca13a20620bb 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -148,6 +148,33 @@ class SquareCompute : public KernelLite { virtual ~SquareCompute() = default; }; +class HardSwishCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~HardSwishCompute() = default; +}; + +class ReciprocalCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ReciprocalCompute() = default; +}; + +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~AbsCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index 709942a0d9f385e4ba55be32657633c0edc378cf..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/compare_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define COMPARE_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T &a, const T &b) const { return a op b; } \ - }; - -COMPARE_FUNCTOR(Equal, ==); -COMPARE_FUNCTOR(NotEqual, !=); -COMPARE_FUNCTOR(LessThan, <); -COMPARE_FUNCTOR(LessEqual, <=); -COMPARE_FUNCTOR(GreaterThan, >); -COMPARE_FUNCTOR(GreaterEqual, >=); - -template <> -struct _EqualFunctor { - inline bool operator()(const float &a, const float &b) const { - // It is safe to cast a and b to double. 
- return fabs(static_cast(a - b)) < 1e-8; - } -}; - -template <> -struct _NotEqualFunctor { - inline bool operator()(const float &a, const float &b) const { - return !_EqualFunctor()(a, b); - } -}; - -inline void get_mid_dims(const lite::DDim &x_dims, - const lite::DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post) { - *pre = 1; - *n = 1; - *post = 1; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - -template
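// A worked example of the pre/n/post decomposition that get_mid_dims above
// (from the removed compare_compute.cc) computes for broadcasting: the x
// tensor is viewed as a [pre, n, post] block, where n covers the dims matched
// by y starting at `axis`. The helper below restates that logic with plain
// std::vector dims so the arithmetic is easy to follow; it is an illustration,
// not the deleted implementation.
#include <cstdint>
#include <vector>

static void MidDims(const std::vector<int64_t>& x_dims,
                    const std::vector<int64_t>& y_dims,
                    int axis,
                    int* pre, int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(x_dims[i]);
  for (size_t i = 0; i < y_dims.size(); ++i) *n *= static_cast<int>(y_dims[i]);
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= static_cast<int>(x_dims[i]);
}
// Example: MidDims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post) gives
// pre = 2, n = 12, post = 5, i.e. y is broadcast over the middle 3x4 block.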