diff --git a/CMakeLists.txt b/CMakeLists.txt index aefe8cc19c586381aea83645e80b1fd700959bbc..065bcbe3490d7d8ba92dbd17d115d7fefe5c1ec6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + endif() + + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + add_compile_options(/MP) + message(STATUS "Using parallel compiling (/MP)") + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + +endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) find_package(CUDA QUIET) endif() @@ -59,10 +84,12 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) +lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) @@ -105,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) + if(WIN32) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) + else() + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE) + endif() endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") @@ -129,6 +163,10 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -136,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) include(cross_compiling/postproject) include(device/npu) # check and prepare NPU DDK include(device/xpu) # check and prepare XPU SDK + include(device/apu) # check and prepare APU SDK # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # So the following third party dependencies are not needed. 
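The MSVC_STATIC_CRT branch above switches the build to the static C runtime by appending /MTd and /MT after whatever /M* flag CMake has already placed in the per-configuration variables, relying on MSVC taking the last runtime switch it sees. A minimal sketch of the stricter variant that rewrites an existing /MD(d) in place instead of appending; this is illustrative only and not part of the patch:

    if(MSVC AND MSVC_STATIC_CRT)
      foreach(flag_var
              CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
              CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
        # Replace the dynamic CRT (/MD, /MDd) with the static CRT (/MT, /MTd);
        # /MDd becomes /MTd because the trailing debug suffix is preserved.
        string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endforeach()
    endif()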
@@ -185,6 +224,7 @@ endif() include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package + include(external/libxsmm) # download, build, install libxsmm include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -209,7 +249,9 @@ include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION -include(flags) +if(NOT APPLE) + include(flags) +endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/build.bat b/build.bat new file mode 100644 index 0000000000000000000000000000000000000000..4510ee774ed9a3b9fe5a9d55b405b1dae39c3f45 --- /dev/null +++ b/build.bat @@ -0,0 +1,134 @@ +@echo off +setlocal +setlocal enabledelayedexpansion + +set source_path=%~dp0 +rem global variables +set BUILD_EXTRA=OFF +set BUILD_JAVA=ON +set BUILD_PYTHON=OFF +set BUILD_DIR=%source_path% +set OPTMODEL_DIR="" +set BUILD_TAILOR=OFF +set BUILD_CV=OFF +set SHUTDOWN_LOG=ON + +set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz + +set workspace=%source_path% + +:set_vcvarsall_dir +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +set tmp_var=!vcvarsall_dir! +call:remove_space +set vcvarsall_dir=!tmp_var! +IF NOT EXIST "%vcvarsall_dir%" ( + echo "------------%vcvarsall_dir% not exist------------" + goto set_vcvarsall_dir +) + +call:prepare_thirdparty + +if EXIST "%build_directory%" ( + call:rm_rebuild_dir "%build_directory%" + md "%build_directory%" +) + +set root_dir=%workspace% +set build_directory=%BUILD_DIR%\build.lite.x86 +set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code +set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug + +rem for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +rem here we fake an empty file to make cmake works. +if NOT EXIST "%GEN_CODE_PATH_PREFIX%" ( + md "%GEN_CODE_PATH_PREFIX%" +) + +type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc" + +if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" ( + md "%DEBUG_TOOL_PATH_PREFIX%" +) + +copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\" + +cd "%build_directory%" + + cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^ + -DWITH_MKLDNN=OFF ^ + -DLITE_WITH_X86=ON ^ + -DLITE_WITH_PROFILE=OFF ^ + -DWITH_LITE=ON ^ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^ + -DLITE_WITH_ARM=OFF ^ + -DWITH_GPU=OFF ^ + -DLITE_BUILD_EXTRA=ON ^ + -DLITE_WITH_PYTHON=ON ^ + -DPYTHON_EXECUTABLE="%python_path%" + +call "%vcvarsall_dir%" amd64 + +msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1 +goto:eof + +:prepare_thirdparty + SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>" + set tmp_var=!python_path! + call:remove_space + set python_path=!tmp_var! + if "!python_path!"=="" ( + set python_path=python.exe + ) else ( + if NOT exist "!python_path!" ( + echo "------------!python_path! not exist------------" + goto:eof + ) + ) + + if EXIST "%workspace%\third-party" ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists." 
+ ) else ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz exists." + call:rm_rebuild_dir "%workspace%\third-party" + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + ) else ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists." + call:download_third_party + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) else ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists." + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + + ) + git submodule update --init --recursive +goto:eof + +:download_third_party +powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^ +'%workspace%third-party-05b862.tar.gz') +goto:eof + +:rm_rebuild_dir + del /f /s /q "%~1\*.*" >nul 2>&1 + rd /s /q "%~1" >nul 2>&1 +goto:eof + + +:remove_space +:remove_left_space +if "%tmp_var:~0,1%"==" " ( + set "tmp_var=%tmp_var:~1%" + goto remove_left_space +) + +:remove_right_space +if "%tmp_var:~-1%"==" " ( + set "tmp_var=%tmp_var:~0,-1%" + goto remove_left_space +) +goto:eof \ No newline at end of file diff --git a/cmake/configure.cmake b/cmake/configure.cmake index caf456367047277344f0353b6c142b039a81b12c..cf99645409436f24533005b9a74f2bdb1c89f662 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -34,6 +34,15 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + + if (NOT MSVC) + message(FATAL_ERROR "Windows builds only support MSVC, which is the host compiler that NVIDIA's nvcc binds to.") + endif(NOT MSVC) +endif(WIN32) + if(LITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) @@ -70,7 +79,7 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) + if(WIN32 OR APPLE) # openmp not support well for now on windows set(OPENMP_FLAGS "") else(WIN32) @@ -134,6 +143,14 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_APU) + add_definitions("-DLITE_WITH_APU") +endif() + +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") if (LITE_WITH_XTCL) @@ -181,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") endif(LITE_ON_MODEL_OPTIMIZE_TOOL) +if (LITE_WITH_PYTHON) + add_definitions("-DLITE_WITH_PYTHON") +endif(LITE_WITH_PYTHON) diff --git a/cmake/device/apu.cmake b/cmake/device/apu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d32e77f867ba3a7628475f8ea06816aa14097442 --- /dev/null +++ b/cmake/device/apu.cmake @@ -0,0 +1,65 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_APU) + return() +endif() + +if(NOT DEFINED APU_DDK_ROOT) + set(APU_DDK_ROOT $ENV{APU_DDK_ROOT}) + if(NOT APU_DDK_ROOT) + message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON") + endif() +endif() + +message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}") +find_path(APU_DDK_INC NAMES NeuronAdapter.h + PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH) +if(NOT APU_DDK_INC) + message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include") +endif() +message(STATUS "APU_DDK_INC: ${APU_DDK_INC}") + +include_directories("${APU_DDK_ROOT}/include") + +set(APU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(APU_SUB_LIB_PATH "lib64") +endif() + +find_library(APU_NEURON_FILE NAMES neuron + PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH}) + +find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter + PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH}) + +if(NOT APU_NEURON_FILE) + message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}") +else() + message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}") + add_library(apu_neuron SHARED IMPORTED GLOBAL) + set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE}) +endif() + +if(NOT APU_NEURON_ADAPTER_FILE) + message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}") +else() + message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}") + add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL) + set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE}) +endif() + +set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs") +message(STATUS "${apu_runtime_libs}") + diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
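cmake/device/rknpu.cmake below follows the same layout as cmake/device/apu.cmake above: resolve the DDK root from a CMake variable or the environment, stop with FATAL_ERROR if the headers cannot be found, wrap the prebuilt shared library in an IMPORTED target, and export the result through an INTERNAL cache variable (apu_runtime_libs / rknpu_runtime_libs). A minimal sketch of how a consumer target would pick those cached lists up; the target and source names are placeholders, not part of the patch:

    # Hypothetical consumer of the cached runtime-library lists set by the
    # device scripts; a no-op when neither backend is enabled.
    if(LITE_WITH_APU OR LITE_WITH_RKNPU)
      add_library(device_bridge_demo STATIC bridge_demo.cc)
      target_link_libraries(device_bridge_demo
                            ${apu_runtime_libs} ${rknpu_runtime_libs})
    endif()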
+ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +if(NOT DEFINED RKNPU_DDK_ROOT) + set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT}) + if(NOT RKNPU_DDK_ROOT) + message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON") + endif() +endif() + +message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}") +find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h + PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH) +if(NOT RKNPU_DDK_INC) + message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include") +endif() + +include_directories("${RKNPU_DDK_ROOT}/include") + +set(RKNPU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(RKNPU_SUB_LIB_PATH "lib64") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(RKNPU_SUB_LIB_PATH "lib") +endif() + +find_library(RKNPU_DDK_FILE NAMES rknpu_ddk + PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}) + +if(NOT RKNPU_DDK_FILE) + message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}") +else() + message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}") + add_library(rknpu_ddk SHARED IMPORTED GLOBAL) + set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE}) +endif() + +set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 599e7bba7eaf12da7506ce44e706bd9f50ec6998..5a757659bb036ca99326bc40cc075f761ba6e641 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -36,7 +36,16 @@ else() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen GIT_TAG - URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen before v2.3.0 + # URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen since v2.6.0 + # github address: https://github.com/eigenteam/eigen-git-mirror + # we changed the source code to adapt for windows compiling + # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h + ###################################################################################################### + URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 142fce816de4f06aa0a36b91e3e4ecb962a8dc2a..8d094d6e064fe57b170d1a50a5457c104d3c3ac2 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. 
Force WITH_MKLML=OFF.") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -38,7 +32,17 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSEIF(APPLE) + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ae99f4df9a3676ae8f5b2c4c01305ead9b7a8254..57e332f1c103b28a194670de609ee521aa41cdf3 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) + #find_python_module(numpy REQUIRED) #find_python_module(wheel REQUIRED) #find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) + #FIND_PACKAGE(NumPy REQUIRED) #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # "please use pip to upgrade protobuf. 
pip install -U protobuf") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 225a3c19a16435c4df6403ff7d1bdd01e628dd72..d859404d559282970d96a735c400f745481e8efa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + elseif(NOT APPLE) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index a07edaa57533e35943aedc5dbf812598d6215714..8408a79fa4265b08771e435dcc5e82801a9d40f9 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -88,6 +88,18 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_APU) + foreach(var ${lite_deps_APU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + + if (LITE_WITH_RKNPU) + foreach(var ${lite_deps_RKNPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + if (LITE_WITH_XPU) foreach(var ${lite_deps_XPU_DEPS}) set(deps ${deps} ${var}) @@ -131,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -142,10 +154,12 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -161,8 +175,10 @@ function(lite_cc_library TARGET) else() cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() # collect targets need to compile for lite if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) add_dependencies(lite_compile_deps ${TARGET}) @@ -177,7 +193,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" 
"${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -190,8 +206,10 @@ function(lite_cc_binary TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -199,7 +217,9 @@ function(lite_cc_binary TARGET) MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() if (NOT APPLE) # strip binary target to reduce size if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -226,7 +246,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -247,8 +267,10 @@ function(lite_cc_test TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -263,7 +285,9 @@ function(lite_cc_test TARGET) "${TARGET}" COMMENT "Strip debug symbols done on final executable file.") endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() file(APPEND ${offline_test_registry_file} "${TARGET}\n") # collect targets need to compile for lite @@ -277,9 +301,11 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") +set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -295,12 +321,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -323,6 +349,12 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + 
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") @@ -352,6 +384,15 @@ function(add_kernel TARGET device level) endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "APU") + if (NOT LITE_WITH_APU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) foreach(src ${args_SRCS}) @@ -379,8 +420,20 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "MLU") if (NOT LITE_WITH_MLU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") @@ -423,8 +476,10 @@ function(add_kernel TARGET device level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -444,7 +499,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -477,8 +532,10 @@ function(add_operator TARGET level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -486,6 +543,29 @@ function(add_operator TARGET level) ) endfunction() +#only for windows +function(create_static_lib TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 60) + list(LENGTH libs libs_len) + + foreach(lib ${libs}) + list(APPEND dummy_list ${lib}) + list(LENGTH dummy_list listlen) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) + set(dummy_list) + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) +endfunction() # Bundle several static libraries into one. 
function(bundle_static_library tgt_name bundled_tgt_name fake_target) @@ -529,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) set(bundled_tgt_full_name ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) - #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") + message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") + + if(WIN32) + set(dummy_tgt_name dummy_${bundled_tgt_name}) + create_static_lib(${bundled_tgt_name} ${static_libs}) + add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name}) + add_dependencies(${fake_target} ${tgt_name}) + + add_library(${dummy_tgt_name} STATIC IMPORTED) + set_target_properties(${dummy_tgt_name} + PROPERTIES + IMPORTED_LOCATION ${bundled_tgt_full_name} + INTERFACE_INCLUDE_DIRECTORIES $) + add_dependencies(${dummy_tgt_name} ${fake_target}) + return() + endif() if(NOT IOS) file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index e7c4e5fcc5c00929058f11160d0f87d13cbe7f4b..e2b15b187bf6dd3b77fe353f23b5d65bf56e44c7 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") @@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_XPU) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") endif(LITE_WITH_XPU) + if (LITE_WITH_APU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu") + endif(LITE_WITH_APU) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -83,16 +91,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") # add python lib if (LITE_WITH_PYTHON) - add_custom_target(publish_inference_python_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + if(WIN32) + set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd") + set(LITE_CORE_DEPS ${LITE_CORE}) + add_custom_command(OUTPUT ${LITE_CORE} + COMMAND cmake -E 
copy $ ${LITE_CORE} + DEPENDS lite_pybind) + add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS}) + + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd" + DEPENDS copy_lite_pybind + ) + + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) + add_custom_target(publish_inference_python_light_demo ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + ) + add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) + add_dependencies(publish_inference publish_inference_python_light_demo) + else() + if(APPLE) + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + else() + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + endif() add_custom_target(publish_inference_python_installer ${TARGET} - COMMAND python setup.py bdist_wheel + COMMAND 
${PYTHON_EXECUTABLE} setup.py bdist_wheel WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} @@ -108,30 +159,78 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) + endif(WIN32) endif() if (LITE_WITH_CUDA OR LITE_WITH_X86) - add_custom_target(publish_inference_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_custom_target(publish_inference_third_party ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") - add_dependencies(publish_inference_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cxx_lib) - add_dependencies(publish_inference publish_inference_third_party) + if(APPLE) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + elseif(NOT WIN32) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + 
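Throughout the publish targets above, the Windows branches replace the POSIX mkdir -p and cp commands with ${CMAKE_COMMAND} -E subcommands so the same custom targets run without a Unix shell; the add_dependencies calls just below then wire each branch into publish_inference. A minimal sketch of that portable pattern for an extra artifact; the target name and copied file are placeholders, not part of the patch:

    # Hypothetical publish step written only with cmake -E, so it behaves the
    # same under the Visual Studio, Xcode and Makefile generators.
    add_custom_target(publish_inference_example ALL
      COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/docs"
      COMMAND ${CMAKE_COMMAND} -E copy
              "${CMAKE_SOURCE_DIR}/README.md"
              "${INFER_LITE_PUBLISH_ROOT}/docs/README.md")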
add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + endif() endif() if (LITE_WITH_X86) + if(WIN32) + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + + add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) + add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + + add_custom_target(publish_inference_x86_cxx_demos ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) + add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3) + + else() + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" @@ -146,6 +245,7 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_dependencies(publish_inference publish_inference_x86_cxx_demos) + endif() endif() if(LITE_WITH_CUDA) diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 503926978937e788c38b8f08d9d3dd71980918af..0f60b13f35d51d3917425df75d3f157f8b5a87c3 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) 
add_dependencies(paddle_full_api_shared dynload_mklml) endif() + if(WIN32) + target_link_libraries(paddle_full_api_shared shlwapi.lib) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") @@ -34,15 +37,20 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} ) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") - add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) - add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) - set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) - add_dependencies(paddle_full_api_shared custom_linker_map) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + if(NOT APPLE AND NOT WIN32) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") @@ -57,6 +65,11 @@ else() # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -67,8 +80,11 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} - MLU_DEPS ${mlu_kernels}) + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels}) + endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -80,6 +96,12 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") +message(STATUS "get APU kernels ${apu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") @@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS 
${opencl_kernels} FPGA_DEPS ${fpga_kernels}) @@ -125,7 +151,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -144,7 +172,9 @@ if(WITH_TESTING) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -200,7 +230,7 @@ if(WITH_TESTING) endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) + set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels}) lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc DEPS ${lite_model_test_DEPS} @@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels}) @@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model @@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -352,8 +388,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -365,8 +403,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -378,8 +418,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -390,7 +432,9 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + 
RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} @@ -401,19 +445,24 @@ if(NOT IOS) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 0843faf0d6b060a5b76a850de069b1dbf714da19..0ce7f6f0d5aa5bb5c7bc66dbeddaa618fa6466e6 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -13,7 +13,13 @@ // limitations under the License. #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include @@ -27,6 +33,9 @@ #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", "the path of the model, the model and param files is under " @@ -61,10 +70,7 @@ DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", "save the inference time to the file."); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. 
"); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -100,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK void Run(const std::vector& input_shape, - const std::string& model_dir, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -116,10 +130,7 @@ void Run(const std::vector& input_shape, auto input_tensor = predictor->GetInput(0); input_tensor->Resize(input_shape); auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shape.size(); ++i) { - input_num *= input_shape[i]; - } + int64_t input_num = ShapeProduction(input_shape); if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; @@ -167,26 +178,78 @@ void Run(const std::vector& input_shape, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + LOG(INFO) << "output data[0:10]:"; + for (int i = 0; i < 10; i++) { + LOG(INFO) << out_data[i]; + } + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (The path of the model that is optimized\n" + " by opt. If the model is optimized, please set the param.) \n" + " type: string \n" + " --model_dir (The path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (The filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (The filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (Set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (The path of input image, if not set\n" + " input_img_path, the input will be 1.0.) type: string \n " + " --power_mode (Arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (Repeats times) type: int32 default: 1 \n" + " --result_filename (Save the inference time to the file.) 
type: \n" + " string default: result.txt \n" + " --threads (Threads num) type: int32 default: 1 \n" + " --warmup (Warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path. Otherwise, \n" + " set model_dir, model_filename and param_filename according to \n" + " the model. \n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "") { - LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - if (FLAGS_model_dir.back() == '/') { - FLAGS_model_dir.pop_back(); - } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -202,19 +265,31 @@ int main(int argc, char** argv) { } return shape; }; - std::vector input_shape = get_shape(FLAGS_input_shape); - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shape, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 3f3428b434e98ffb0ba578ef7f31a4fbcd9ca619..f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -292,9 +292,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc, program_desc_ = desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; - inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); - inner_places.emplace_back( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + for (auto &valid_place : valid_places) { + inner_places.emplace_back( + Place(TARGET(kHost), valid_place.precision, valid_place.layout)); + } // Analysis whether the modle is quantized. 
// For quantized model, add place(arm, int8) to inner_places diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index ccd7c981385ff776c47c01fbfdd058001341dff6..28e87dca394ba06844269746c19a892c26e0c653 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,24 +20,32 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) #include #include "lite/backends/x86/mklml.h" #endif - namespace paddle { namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; auto places = config.valid_places(); + std::vector passes{}; #ifdef LITE_WITH_CUDA // if kCUDA is included in valid places, it should be initialized first, // otherwise skip this step. for (auto &p : places) { if (p.target == TARGET(kCUDA)) { Env::Init(); + if (config_.multi_stream()) { + passes = {"multi_stream_analysis_pass"}; + VLOG(3) << "add pass: " << passes[0]; + } break; } } @@ -51,7 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config.mlu_first_conv_std(), config.mlu_input_layout()); #endif // LITE_WITH_MLU - std::vector passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; @@ -63,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); - #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index d82869dbef00929b70a87e05b91ef4a82630bbbe..65ce77276afdb4c3b7a7247cdb8ae120497d8145 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For weight quantization of post training, load the int8/16 weights + // for optimized model, and dequant it to fp32. 
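For clarity, a standalone sketch (plain arrays and made-up values, not lite::Tensor) of the per-channel dequantization that DequantizeWeight() below performs for conv weights, matching the PROCESS_CONV2D_DATA macro later in this file:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // An int8 conv weight of shape [ch, offset] with one scale per output
  // channel is restored to fp32 as fp[i][j] = scale[i] * int[i][j].
  const int64_t ch = 2, offset = 3;
  std::vector<int8_t> int_data = {10, 20, 30, 40, 50, 60};
  std::vector<float> scale_list = {0.05f, 0.1f};
  std::vector<float> fp_data(ch * offset);
  for (int64_t i = 0; i < ch; ++i) {
    for (int64_t j = 0; j < offset; ++j) {
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
  for (float v : fp_data) std::cout << v << " ";  // prints: 0.5 1 1.5 4 5 6
  std::cout << "\n";
  return 0;
}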
DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -79,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { if (element == input_names_.end()) { LOG(ERROR) << "Model do not have input named with: [" << name << "], model's inputs include:"; - for (int i = 0; i < input_names_.size(); i++) { + for (size_t i = 0; i < input_names_.size(); i++) { LOG(ERROR) << "[" << input_names_[i] << "]"; } return nullptr; @@ -111,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { auto current_block = cpp_program_desc_.GetBlock(0); std::vector feeds; std::vector fetchs; - for (int i = 0; i < current_block->OpsSize(); i++) { + for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); if (op->Type() == "feed") { feeds.push_back(op); @@ -121,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { } input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); - for (int i = 0; i < feeds.size(); i++) { + for (size_t i = 0; i < feeds.size(); i++) { input_names_[feeds[i]->GetAttr("col")] = feeds[i]->Output("Out").front(); } - for (int i = 0; i < fetchs.size(); i++) { + for (size_t i = 0; i < fetchs.size(); i++) { output_names_[fetchs[i]->GetAttr("col")] = fetchs[i]->Input("X").front(); } @@ -138,9 +141,6 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 2. Create Instructs #ifdef LITE_WITH_OPENCL - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; using OpenCLContext = Context; std::unique_ptr local_ctx(new KernelContext()); local_ctx->As().InitOnce(); @@ -182,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - 
op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc index b49ff8b80c936b93acd630c6e0cde03df8b22ee4..08779c0b5c9802ebc5095241b2543d8724981dff 100644 --- a/lite/api/light_api_test.cc +++ b/lite/api/light_api_test.cc @@ -37,11 +37,11 @@ TEST(LightAPI, load) { const std::vector inputs = predictor.GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } const std::vector outputs = predictor.GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index 12559d171ff3df808cf252e8e09c652246902abf..33c0a94cf1a254e42c47aa462c5cfe12e386a87e 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -293,13 +293,13 @@ int main(int argc, char** argv) { std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { 
input_shapes.push_back(get_shape(str_input_shapes[i])); } std::vector str_input_shapes_0 = split_string(FLAGS_input_shape_0); std::vector> input_shapes_0; - for (int i = 0; i < str_input_shapes_0.size(); ++i) { + for (size_t i = 0; i < str_input_shapes_0.size(); ++i) { input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); } diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index b0f7a0479f0db91b816838f9d0ee1cc31b9b232a..f61ed9b4c38fcc3a6fe33fd26d6d3a80edcb9373 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); +#ifdef LITE_WITH_X86 + config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kInt64)}, + Place{TARGET(kHost), PRECISION(kFloat)}}); +#else config.set_valid_places({ Place{TARGET(kARM), PRECISION(kFloat)}, }); +#endif auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -198,7 +204,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc index 375d249476bf5323d69ea41c3f11d07e9c8bc711..5d2011e29bfdeb166ae1ad202d96a204893888b0 100644 --- a/lite/api/model_test_classify.cc +++ b/lite/api/model_test_classify.cc @@ -310,7 +310,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc index f9be12b2c78c623a2b2c9852850576cc11815bd3..f059aca6330613f66fa93267c0c594cfba6d8833 100644 --- a/lite/api/model_test_detection.cc +++ b/lite/api/model_test_detection.cc @@ -114,7 +114,7 @@ void detect_object(const float* dout, } std::string name = FLAGS_out_txt + "_accu.txt"; FILE* fp = fopen(name.c_str(), "w"); - for (int i = 0; i < objects.size(); ++i) { + for (size_t i = 0; i < objects.size(); ++i) { Object object = objects.at(i); if (object.prob > thresh && object.x > 0 && object.y > 0 && object.width > 0 && object.height > 0) { @@ -324,7 +324,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 7a8cd7f1ef1234269c986b781f0546b26df53c4b..a6ad7cff6f234187770eccf1501378c04201b729 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -104,13 +104,21 @@ std::vector ParserValidPlaces() { valid_places.emplace_back( TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if 
(target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)}); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); } else if (target_repr == "mlu") { valid_places.emplace_back(TARGET(kMLU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -187,6 +195,8 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", + "kAPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -251,16 +261,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index bd86f486248a2daccde13da078ae3860d8e31169..14c1ca4a4e9c19d2d3c27b783267682457eeddb2 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { valid_places_.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places_.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places_.emplace_back(TARGET(kRKNPU)); + valid_places_.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places_.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { " `set_param_file(param_file_path)`\n" " `set_model_type(protobuf|naive_buffer)`\n" " `set_optimize_out(output_optimize_model_dir)`\n" - " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `run_optimize(false|true)`\n" " ` ----fasle&true refer to whether to record ops info for " "tailoring lib, false by default`\n" @@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", + "kAPU", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index ce0f0e15d84835fab733a5114906e0a0df3a0064..79ab98da799a99540217d55e3d40b46800f17626 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,6 +136,9 @@ class 
LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_CUDA + bool multi_stream_{false}; +#endif #ifdef LITE_WITH_MLU lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; @@ -171,6 +174,10 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif +#ifdef LITE_WITH_CUDA + void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } + int multi_stream() const { return multi_stream_; } +#endif #ifdef LITE_WITH_MLU // set MLU core version, which is used when compiling MLU kernels diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 9b8384f2823ee121aa8bb505dd135735d9f96774..832867df079efa1baebf08da4c0d8e37958460f1 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -36,11 +36,11 @@ TEST(CxxApi, run) { auto inputs = predictor->GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } auto outputs = predictor->GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } auto input_tensor = predictor->GetInputByName(inputs[0]); diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index e99127e233bc4adf159a6a567dfb15f6fd784a27..9dc5c9e857243ecb57f785737b00929e36c5d83c 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -18,20 +18,21 @@ */ #pragma once -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); +// some platform-independent defintion +#include "lite/utils/macros.h" + +#define USE_LITE_OP(op_type__) \ + extern int touch_op_##op_type__(); \ + int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); + UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ +#define USE_MIR_PASS(name__) \ + extern bool mir_pass_registry##name__##_fake(); \ + static bool mir_pass_usage##name__ UNUSED = \ mir_pass_registry##name__##_fake(); #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index aceb047b64f54ac18ac492ef495d32c3180ad4b4..3cef9563d89cd5b21dbdcb0c4ccf1504e7d311b3 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "mlu"}; + "mlu", + "rknpu", + "apu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) { "kFPGA", "kNPU", "kXPU", + "kBM", "kMLU", - "kBM"}; + "kRKNPU", + "kAPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return 
target2string[x]; @@ -156,6 +160,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kXPU), TARGET(kBM), TARGET(kMLU), + TARGET(kAPU), TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index f57b9832f2b35fc3db74232192bd397ec8b4930c..7066656f18ec0693048223f5f1201e77a1b0a37d 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -49,13 +49,15 @@ enum class TargetType : int { kCUDA = 3, kARM = 4, kOpenCL = 5, + kAny = 6, // any target kFPGA = 7, kNPU = 8, kXPU = 9, kBM = 10, kMLU = 11, - kAny = 6, // any target - NUM = 12, // number of fields. + kRKNPU = 12, + kAPU = 13, + NUM = 14, // number of fields. }; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 219952bd2aa440c81b116d9ae8aaba0920268eb5..82cd7f3d8da5eb4f00c9069731960a81ef9fe87d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -42,12 +42,14 @@ USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); +USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); +USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8..5dfecf8c619d8cf9be7a03fa46b4e86a6e641a29 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -17,8 +17,12 @@ execute_process( OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE ) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - +if(APPLE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +endif() add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..72a75d9caaa79fa96e52e8603ae6886aac341009 100644 --- a/lite/api/python/__init__.py +++ b/lite/api/python/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +import os +import sys + +if os.name =='nt': + current_path = os.path.abspath(os.path.dirname(__file__)) + third_lib_path = current_path + os.sep + 'libs' + os.environ['path'] = third_lib_path+ ';' + os.environ['path'] + sys.path.insert(0, third_lib_path) diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index b1de18d50c1582b0f872ad38d24939665ab1d3b0..fe4cdb5a73d62afa98fb8c343e8a6a20388e293b 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() -lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +if(WIN32) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(lite_pybind ${os_dependency_modules}) +else() + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +endif(WIN32) + if (LITE_ON_TINY_PUBLISH) set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") endif() diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 5512e7bc438eddd6bcd9c8f792fc8507b03bf800..06d1c607fd761f9f6e58a4c5779e2c3cb9f4e6b3 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { .value("FPGA", TargetType::kFPGA) .value("NPU", TargetType::kNPU) .value("MLU", TargetType::kMLU) + .value("RKNPU", TargetType::kRKNPU) + .value("APU", TargetType::kAPU) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index 79028fb7493bf55eab74aa76ee51ac79f418ba0a..b04a6077f5aafecf76fed0b0dee5c56919b9302e 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -34,20 +34,27 @@ else: # core lib of paddlelite is stored as lite.so LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' -PACKAGE_DATA = {'paddlelite': ['lite.so']} +PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']} # put all thirdparty libraries in paddlelite.libs PACKAGE_DATA['paddlelite.libs'] = [] LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) - PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] - + if os.name != 'nt': + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + else: + PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll'] + shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll'] # link lite.so to paddlelite.libs -COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ -/inference_lite_lib/python/install/lite/lite.so" -if os.system(COMMAND) != 0: - raise Exception("patch third_party libs failed, command: %s" % COMMAND) +if os.name != 'nt': + COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ + /inference_lite_lib/python/install/lite/lite.so" + if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + + # remove unused paddle/libs/__init__.py if os.path.isfile(LIB_PATH+'/__init__.py'): @@ -61,6 +68,14 @@ PACKAGE_DIR = { 'paddlelite': LITE_PATH } +if os.name == 'nt': + # fix the path separator under windows + 
fix_package_dir = {} + for k, v in PACKAGE_DIR.items(): + fix_package_dir[k] = v.replace('/', '\\') + PACKAGE_DIR = fix_package_dir + + setup( name='paddlelite', version=PADDLELITE_VERSION, diff --git a/lite/api/python/setup_mac.py.in b/lite/api/python/setup_mac.py.in new file mode 100644 index 0000000000000000000000000000000000000000..c8dfe2cc5c13b3105fc1aed404676eefd40877e8 --- /dev/null +++ b/lite/api/python/setup_mac.py.in @@ -0,0 +1,73 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' + +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib'] + +# link lite.so to paddlelite.libs +COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_googlenet_lite.cc b/lite/api/test_googlenet_lite.cc index 8ff7a49af9cbce09d205bb8633a913410beb91c3..f20714f096756da63bdb99c5bcf57b225658b096 100644 --- a/lite/api/test_googlenet_lite.cc +++ b/lite/api/test_googlenet_lite.cc @@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { for (size_t i = 0; i < results.size(); ++i) { EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); } - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); } diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index a17fc331310cfe17ec36be504b94ddacc724e90f..fa6e20230d68c73b0720606816a4594077278d56 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -15,7 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif #include #include diff --git a/lite/api/test_inceptionv4_lite_x86.cc b/lite/api/test_inceptionv4_lite_x86.cc index e986784809951390889e17f766302fc5ea459465..00f775ddb7e7bf2d2f23c34ce19e576a4d2d27ed 100644 --- a/lite/api/test_inceptionv4_lite_x86.cc +++ b/lite/api/test_inceptionv4_lite_x86.cc @@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv1_lite_x86.cc b/lite/api/test_mobilenetv1_lite_x86.cc index 67dc1b2436988c7d0d853c945fecce27ef2d329f..8a7547b9031d0723c528e7dd6e8d7e3fb6201b7d 100644 --- a/lite/api/test_mobilenetv1_lite_x86.cc +++ b/lite/api/test_mobilenetv1_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); auto out = 
predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv2_lite_x86.cc b/lite/api/test_mobilenetv2_lite_x86.cc index 95e88abcc8e59c6808ea2dc44cf7d1bdd53ac9d0..92c8182f7330a76bf55cf34fbb9e4fdba1fa2fc6 100644 --- a/lite/api/test_mobilenetv2_lite_x86.cc +++ b/lite/api/test_mobilenetv2_lite_x86.cc @@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_resnet50_lite_x86.cc b/lite/api/test_resnet50_lite_x86.cc index 3f9b59d714de611ef0a84cfc3b283d0dddd5c294..b185159801b6264555367b41f7def1bd0e7a5a3f 100644 --- a/lite/api/test_resnet50_lite_x86.cc +++ b/lite/api/test_resnet50_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index e1c315f4a63ffd3ed8f51fa4b73ac88b50835cab..3cd8416d5e2293642abc68e457465c8a836f790b 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -232,8 +232,8 @@ void TestModel(const std::vector& valid_places, for (int i = 0; i < outs->numel(); ++i) { LOG(INFO) << o_data[i]; } - for (int i = 0; i < lod.size(); ++i) { - for (int j = 0; j < lod[i].size(); ++j) { + for (size_t i = 0; i < lod.size(); ++i) { + for (size_t j = 0; j < lod[i].size(); ++j) { LOG(INFO) << lod[i][j]; } } diff --git a/lite/backends/CMakeLists.txt 
b/lite/backends/CMakeLists.txt index fb459ae3621d1281f0a2433ca6b237a165d078a1..7f0d53f976ace17ee8d95e62e62d56f5cb974881 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -8,3 +8,5 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(apu) +add_subdirectory(rknpu) diff --git a/lite/backends/apu/CMakeLists.txt b/lite/backends/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..68d77a401f541fa56b2b53ea9a99619f1baafb42 --- /dev/null +++ b/lite/backends/apu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_APU) + return() +endif() + +lite_cc_library(device_apu SRCS device.cc) diff --git a/lite/backends/apu/device.cc b/lite/backends/apu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..27cde9f6efd45a20649b8ff3d4f5ff3b2220aa2d --- /dev/null +++ b/lite/backends/apu/device.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/apu/device.h" +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace apu { + +inline void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) { + typedef int (*NeuronCompilation_create)(NeuronModel * model, + NeuronCompilation * *compilation); + typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation); + typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation); + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create) + LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free) + LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish) +#undef LOAD_FUNCTIONS + + int neuron_errCode = 0; + NeuronCompilation* compilation = NULL; + + VLOG(3) << "[APU] Compile model"; + + neuron_errCode = (*neuron_compilation_create)(model, &compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode; + return nullptr; + } + + neuron_errCode = (*neuron_compilation_finish)(compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] compile failed! 
" << neuron_errCode; + return nullptr; + } + + VLOG(3) << "[APU] Build done"; + return compilation; +} + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/apu/device.h b/lite/backends/apu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..f332512bcb2d5ec9558be0be5694a0623560494c --- /dev/null +++ b/lite/backends/apu/device.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "NeuronAdapter.h" // NOLINT + +namespace paddle { +namespace lite { +namespace apu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + NeuronCompilation* Build(void* libHandle, NeuronModel* model); +}; + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 26e63e23f6acb761b61b397bb881d425e3442468..1d01642100109d14a413ad5e274606c88bf0005a 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -744,6 +744,15 @@ void act_reciprocal(const float* din, } } +template <> +void act_abs(const float* din, float* dout, int size, int threads) { + for (int i = 0; i < size; ++i) { + dout[0] = (din[0] > 0 ? 
din[0] : -din[0]); + din++; + dout++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index ca6b146442a3ec324a9bd244ee4ce6ad0601d4d7..50f60f300bbab9b9f0bcad222f31699b7bfadeab 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -83,6 +83,9 @@ void act_hard_swish(const T* din, template void act_reciprocal(const T* din, T* dout, int size, int threads); +template +void act_abs(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc index 65f93453388d7f41d73669f583d189bec9035bb5..e54d70ffbb119d0a91b82f67b77c9d778dea17bf 100644 --- a/lite/backends/arm/math/concat.cc +++ b/lite/backends/arm/math/concat.cc @@ -16,46 +16,3 @@ #include #include #include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output) { - int64_t concat_input_size = 1; - int64_t num_cancats = 1; - auto dim_0 = input[0]->dims(); - size_t num = input.size(); - for (int i = axis + 1; i < dim_0.size(); i++) { - concat_input_size *= dim_0[i]; - } - for (int i = 0; i < axis; i++) { - num_cancats *= dim_0[i]; - } - float *dst_ptr = output->mutable_data(); - const int out_concat_axis = output->dims()[axis]; - int64_t offset_concat_axis = 0; - int64_t out_sum = out_concat_axis * concat_input_size; - for (int n = 0; n < num; n++) { - auto dims = input[n]->dims(); - const float *src_ptr = input[n]->data(); - int64_t in_concat_axis = dims[axis]; - float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; - int64_t in_sum = in_concat_axis * concat_input_size; - for (int i = 0; i < num_cancats; i++) { - std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum); - dout_ptr += out_sum; - src_ptr += in_sum; - } - offset_concat_axis += in_concat_axis; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h index 4c6159e9e09b66edde812e5098e1263963f3e4da..44e8bf73e220f94dca4ba6713debfae77029867a 100644 --- a/lite/backends/arm/math/concat.h +++ b/lite/backends/arm/math/concat.h @@ -25,9 +25,39 @@ namespace lite { namespace arm { namespace math { -void concat_func(const std::vector &input, +template +void concat_func(const std::vector& input, const int axis, - lite::Tensor *output); + lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0]->dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n]->dims(); + auto* src_ptr = input[n]->data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += 
in_concat_axis; + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc index 56104550d8d68e53ad9a2ac3148887d67480d6f6..a84eef2970b2837159609c1ded1ca0d9991ccfc6 100644 --- a/lite/backends/arm/math/reduce_mean.cc +++ b/lite/backends/arm/math/reduce_mean.cc @@ -198,6 +198,23 @@ void reduce_mean_hw(const float* src, reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); } +template <> +void mean_grad(const float* out_grad, float* in_grad, int size) { + float grad = out_grad[0] / size; + float32x4_t grad_v = vdupq_n_f32(grad); + int loop = size >> 2; + int remain = size & 3; + +#pragma omp parallel for + for (int i = 0; i < loop; ++i) { + vst1q_f32(in_grad, grad_v); + in_grad += 4; + } + for (int i = 0; i < remain; ++i) { + in_grad[i] = grad; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h index 277ed209c058b5b4be76ce18a00683610e6afb7a..aaa9ff42c18d0cfa6a7cf11408dfba06a9444adc 100644 --- a/lite/backends/arm/math/reduce_mean.h +++ b/lite/backends/arm/math/reduce_mean.h @@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, int height_in, int width_in); +template +void mean_grad(const T* out_grad, T* in_grad, int size); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index 35f5f0ce2d93db59cbb856d8008e6f3138633e42..0689bb706ab3bac4b8b97059017181ef24dd8ee4 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) + +lite_cc_library(cuda_context SRCS context.cc DEPS device_info) add_subdirectory(math) diff --git a/lite/backends/cuda/context.cc b/lite/backends/cuda/context.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bac4c442c28848d38bd434d045c7888a1a92ac8 --- /dev/null +++ b/lite/backends/cuda/context.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/context.h" + +namespace paddle { +namespace lite {} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/context.h b/lite/backends/cuda/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5bed30a9603c6f6a48169ae31d66c989bd891836 --- /dev/null +++ b/lite/backends/cuda/context.h @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { + +template +class Context; + +using CUDAContext = Context; + +// Only works with CUDA kernels. +template <> +class Context { + public: + typename Env::Devs& devs = + Env::Global(); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() { + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; + } + } + void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + if (io_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "data stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + io_stream_id = 0; + } + if (exec_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "exec stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + exec_stream_id = 0; + } + + exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; + io_stream_ = devs[dev_id].io_streams()[io_stream_id]; + + exec_stream_id_ = exec_stream_id; + io_stream_id_ = io_stream_id; + need_sync_ = false; + } + void CopySharedTo(CUDAContext* ctx) { + CHECK(ctx); + CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; + ctx->cublas_fp32_ = cublas_fp32_; + } + + const cudaStream_t& exec_stream() const { return exec_stream_; } + void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + + const cudaStream_t& io_stream() const { return io_stream_; } + void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + + std::shared_ptr> cublas_fp32() { return cublas_fp32_; } + void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { + cublas_fp32_ = cublas_fp32; + } + + const std::vector& input_events() { return input_events_; } + void SetInputEvents(const std::vector& input_events) { + input_events_.clear(); + input_events_.assign(input_events.begin(), input_events.end()); + } + + const std::vector& output_events() { return output_events_; } + void SetOutputEvents(const std::vector& output_events) { + output_events_.clear(); + output_events_.assign(output_events.begin(), output_events.end()); + } + + std::vector all_exec_streams() { + int dev_id = TargetWrapper::GetCurDevice(); + return devs[dev_id].exec_streams(); + } + + void SetSyncStreams(const std::vector& nums) { + sync_streams_.clear(); + std::vector exec_streams = all_exec_streams(); + for (size_t i = 0; i < nums.size(); ++i) { + CHECK(nums[i] >= 0 && nums[i] < static_cast(exec_streams.size())) + << "streams id is not valid"; + sync_streams_.push_back(exec_streams[nums[i]]); + } + InitSyncEvents(nums.size()); + } + + void InitSyncEvents(const 
int num) { + sync_events_.clear(); + for (int i = 0; i < num; ++i) { + cudaEvent_t eve; + TargetWrapperCuda::CreateEventWithFlags(&eve); + sync_events_.push_back(eve); + } + } + + void SetNeedSync(bool sync) { need_sync_ = sync; } + bool need_sync() const { return need_sync_; } + + void Sync() { + CHECK_EQ(sync_streams_.size(), sync_events_.size()); + for (size_t i = 0; i < sync_events_.size(); ++i) { + TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]); + TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]); + } + } + + std::string name() const { return "CUDAContext"; } + + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + + private: + int device_id_; + // overall information + int exec_stream_id_; + int io_stream_id_; + cudaStream_t exec_stream_; + cudaStream_t io_stream_; + + // not thread-safe, should allocate for each thread. + std::shared_ptr> cublas_fp32_; + + // kernel information + std::vector input_events_; + std::vector output_events_; + // multi stream sync. + std::vector sync_streams_; + std::vector sync_events_; + bool need_sync_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 0edb83acc4772b2f878b22f2ea16b3175b14a7ba..eff959d992200592c21a024f56713b9abb4b87fb 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -58,7 +58,7 @@ void CLContext::AddKernel(const std::string &kernel_name, auto program = GetProgram(file_name, options); VLOG(3) << " --- end get program --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( + std::shared_ptr kernel( new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 586dc3df1267e47c6cdaad1d362cd9ed2df2770e..41059a0d42a95bbffed4c41611b9f3b8ac60861c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -29,13 +29,14 @@ class CLContext { public: ~CLContext() { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { - clReleaseKernel(kernels_[kidx]->get()); + // Note(ysh329): Don't need `clReleaseKernel` kernels_[kidx].reset(); } kernels_.clear(); kernel_offset_.clear(); for (auto &p : programs_) { - clReleaseProgram(p.second->get()); + // Note(ysh329): Dont't need `clReleaseProgram` + p.second.reset(); } programs_.clear(); LOG(INFO) << "release cl::Program, cl::Kernel finished."; @@ -66,9 +67,10 @@ class CLContext { int divitor = 2); // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // size_t max_work_size); + private: std::unordered_map> programs_; - std::vector> kernels_; + std::vector> kernels_; std::map kernel_offset_; }; diff --git a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl index 9427692f1267d363222295b33b6834e28517d0a4..515bf57487ffd93959929ea93f76b0fdd888c4a5 100644 --- a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl @@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, if (ceil_h > in_dims_h - 1) { ceil_h = in_dims_h- 1; } - float wight0_w = center_w - floor_w; - float wight0_h = center_h - 
floor_h; - float wight1_w = 1.0 - wight0_w; - float wight1_h = 1.0 - wight0_h; + CL_DTYPE wight0_w = center_w - floor_w; + CL_DTYPE wight0_h = center_h - floor_h; + CL_DTYPE wight1_w = 1.0 - wight0_w; + CL_DTYPE wight1_h = 1.0 - wight0_h; const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | @@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); } diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 8a6b026367986548b017aee263a70d4df33381b5..d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -29,12 +29,12 @@ CLRuntime::~CLRuntime() { command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + // For controlling the destruction order command_queue_.reset(); context_.reset(); device_.reset(); platform_.reset(); - LOG(INFO) << "release ~CLRuntime() "; + device_info_.clear(); } bool CLRuntime::Init() { diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 2a8996b066a480d9c0a6db67fa5fd60142885046..503b3a011642a8e018781c08647a958c521e6fac 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -55,7 +55,7 @@ class CLRuntime { std::map& GetDeviceInfo(); private: - CLRuntime() = default; + CLRuntime() { Init(); } ~CLRuntime(); diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe --- /dev/null +++ b/lite/backends/rknpu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs}) diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380 --- /dev/null +++ b/lite/backends/rknpu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
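To make the intent of the new RKNPU device helper below concrete, a hedged usage sketch; the call-site function and names are hypothetical, and building the rk::nn::Graph and its tensors through the rknpu DDK is omitted:

#include <memory>
#include <string>
#include <vector>

#include "lite/backends/rknpu/device.h"

// Hypothetical call site; graph, ins and outs must already be built via the
// rknpu DDK by the subgraph bridge.
std::unique_ptr<rk::nn::Exection> CompileSubgraph(
    rk::nn::Graph* graph,
    std::vector<std::shared_ptr<rk::nn::Tensor>> ins,
    std::vector<std::shared_ptr<rk::nn::Tensor>> outs) {
  std::string model_name = "subgraph_0";  // illustrative only
  // Build() registers the inputs/outputs on the graph, compiles it, and
  // returns an Exection handle that the subgraph kernel can later run.
  return paddle::lite::rknpu::Device::Global().Build(
      model_name, graph, ins, outs);
}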
+ +#include "lite/backends/rknpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace rknpu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ) { + VLOG(3) << "[RKNPU] Build model"; + + rk_graph->SetInputsOutputs(input_nodes, output_nodes); + + std::unique_ptr exector = + std::unique_ptr(new rk::nn::Exection(rk_graph)); + + exector->Build(); + + return exector; +} + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15 --- /dev/null +++ b/lite/backends/rknpu/device.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. 
+ std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt index 63b41ae77d0f3949e3d1de13f9db5ca99b4f1c41..38b47ae3120608c7950a1f081e9ec2b133fb955e 100644 --- a/lite/backends/x86/CMakeLists.txt +++ b/lite/backends/x86/CMakeLists.txt @@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) +lite_cc_library(x86_cpu_info SRCS cpu_info.cc) add_subdirectory(jit) add_subdirectory(math) diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index a05a57e93b23008e49683764b5ed669d5c425e5b..2aaa798fa94b7dd47e4dc15d50e663b8fd3c083a 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib"); #elif defined(_WIN32) return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 010c80fac4842e74c9b8272db472ddf6cf954771..f78df73f66532f891721c74cff9c78cc3bb61922 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { for (size_t g = 0; g < groups.size(); ++g) { size_t x_offset = 0; size_t wgt_offset_tmp = 0; - for (int i = 0; i < g; ++i) { + for (size_t i = 0; i < g; ++i) { wgt_offset_tmp += groups[i] * block_len; } for (int k = 0; k < k_; ++k) { diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 7d051aa6f5802844753b71fd43400e20b7f5965b..a3376be423828b25c6eda6fff30a56578c7bbbe5 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -28,6 +28,12 @@ #define posix_memalign_free free #endif +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 
0 : errno) +#endif + // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); @@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); +#else PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, "GenBase Alloc %ld error!", size); +#endif PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); return ptr; } diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 9cf3281152840416dc141f98992499c663783b7a..5d7e98629cb89bd7a3fdee852507e0f381e54931 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -265,7 +265,7 @@ class BeamSearchFunctor { // size_t num_seqs = scores->NumElements(lod_level); size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { + for (size_t i = 1; i < scores->dims().size(); i++) { seq_width *= scores->dims()[i]; } diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 2d21adaf5d22930ff720c193696eb00c8035579d..3bc5f9f67ad96e7ec699400ff6369fe48c745b7e 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1); + PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 34c55c5714e467954bc1bb79d9b1385ef5cfe497..2d00ebad61840da5b14fbf12d9255394b2b2df1a 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, out_dims); @@ -95,9 +95,9 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } @@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { auto idx_dims = index.dims(); PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1); - for (int64_t i = 1; i < og_dims.size(); ++i) { + for (size_t i = 1; i < og_dims.size(); ++i) { PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, og_dims); diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h index 0689ec4c234509cee6f10f8e0f7dd432edae5c4e..49794b8e15a8f90a6512798baa842534df879f6b 100644 --- a/lite/backends/x86/parallel.h +++ b/lite/backends/x86/parallel.h @@ 
-38,7 +38,7 @@ static inline int64_t GetMaxThreads() { // Do not support nested omp parallem. num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); #endif - return std::max(num_threads, 1L); + return std::max(num_threads, 1L); } using ThreadHandler = diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h index c1b81159aca979efe4b46777a1cef49e44b95e27..0e1e2b77b796eae201c55edcd3caecc263e4271e 100644 --- a/lite/backends/x86/port.h +++ b/lite/backends/x86/port.h @@ -14,10 +14,10 @@ #pragma once +#include #include #include -#include #include #include @@ -37,7 +37,9 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#define NOMINMAX // msvc max/min macro conflict with std::min/max #include +#include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) @@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +extern struct timeval; static int gettimeofday(struct timeval *tp, void *tzp) { time_t clock; struct tm tm; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 278f971b0b1ee8a0b941158839fcc6810e25ad67..55c83cdb4d02d485054ea4d7f3b90fb9f7aa3dc1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) proto_library(framework_proto SRCS framework.proto) endif() -if (LITE_WITH_X86) lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(types SRCS types.cc) -else() -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -endif() lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor) @@ -38,7 +33,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index afc104073684ff00395fb32335630705ff3f7bc8..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 614ee990a9811ab74ceedb4fa000fa385698d679..731215f542567ec3ff0cc87d6990624bfa6b2bc2 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { 
CHECK(!shared_tensor_array->empty()) << "shared_tensor_array is empty yet"; target_tensor_array->resize(shared_tensor_array->size()); - for (int i = 0; i < shared_tensor_array->size(); i++) { + for (size_t i = 0; i < shared_tensor_array->size(); i++) { target_tensor_array->at(i).Resize( shared_tensor_array->at(i).dims()); TargetCopy(param_type->type->target(), @@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, auto b_tensor_array = base_scope_->FindVar(var_name)->GetMutable>(); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (int i = 0; i < a_tensor_array->size(); i++) { + for (size_t i = 0; i < a_tensor_array->size(); i++) { Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i)); if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 7050355fbfae55b9ba626119cd95f8e952c27430..20a0792155f0b4ea8faa7c3fc15ea5c4767352ac 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -166,7 +166,7 @@ class TestCase { // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { if (a.size() != b.size()) return false; - for (int i = 0; i < a.size(); i++) { + for (size_t i = 0; i < a.size(); i++) { if (a[i] != b[i]) return false; } return true; diff --git a/lite/core/context.h b/lite/core/context.h index 061638d63f5187bbfe296afbc3679d9b390a6457..bacb570a903d807945cb9e2a8b98615fcaba9384 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -16,8 +16,7 @@ #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL #include @@ -53,14 +52,15 @@ class Context; using HostContext = Context; using X86Context = Context; -using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; +using APUContext = Context; using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -88,6 +88,21 @@ class Context { }; #endif +#ifdef LITE_WITH_APU +template <> +class Context { + public: + Context() {} + explicit Context(const APUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(APUContext* ctx) {} + + APUContext& operator=(const APUContext& ctx) {} + std::string name() const { return "APUContext"; } +}; +#endif + #ifdef LITE_WITH_BM template <> class Context { @@ -105,6 +120,21 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { @@ -286,103 +316,6 @@ class Context { }; #endif // LITE_WITH_MLU -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. 
-template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - if (devs.size() > 0) { - cublas_fp32_ = std::make_shared>(); - } else { - LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; - } - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0UL) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= static_cast(devs.size())) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t& exec_stream() const { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t& io_stream() const { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - CUDAContext& operator=(const CUDAContext& context) { - this->Init( - context.device_id_, context.exec_stream_id_, context.io_stream_id_); - cublas_fp32_ = const_cast(context).cublas_fp32(); - return *this; - } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - #ifdef LITE_WITH_X86 template <> class Context { @@ -455,7 +388,9 @@ class ContextScheduler { return *x; } - std::unique_ptr NewContext(TargetType target) { + std::unique_ptr NewContext( + TargetType target, + /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): @@ -472,7 +407,7 @@ class ContextScheduler { case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; @@ -489,6 +424,18 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_APU + case TARGET(kAPU): + kernel_contexts_[TargetType::kAPU].As().CopySharedTo( + &ctx->As()); + break; +#endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -558,6 +505,12 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_APU + InitContext(); +#endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 29ac96ed744b016833a746b35002dd68109efd8b..09da06a4168268c670577c159a2a306a8959d81d 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { active_ids_ = core_ids_; } else { active_ids_.resize(thread_num); - for (int i = 0; i < thread_num; ++i) { + for (uint32_t i = 0; i < thread_num; ++i) { if (i < big_core_ids_.size()) { active_ids_[i] = big_core_ids_[i]; } else { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index a108ae3d4b564aaac02a63ead9a35eba26a6cf63..b06eb8d944735971133bb7a29aa0f06075e60626 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -159,7 +159,7 @@ class Env { static Devs* devs = new Devs(); return *devs; } - static void Init(int max_stream = 4) { + static void Init(int max_stream = 6) { #ifdef LITE_WITH_MLU CNRT_CALL(cnrtInit(0)); #endif @@ -175,6 +175,7 @@ class Env { } else { LOG(INFO) << "Found " << count << " device(s)"; } + CHECK_GT(max_stream, 0) << "max_stream must be greater than 0."; // create all device for (int i = 0; i < count; i++) { auto dev = Device(i, max_stream); @@ -234,8 +235,8 @@ class Device { std::string name() { return device_prop_.name; } int core_num() { return device_prop_.multiProcessorCount; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } - std::vector exec_streams() { return exec_stream_; } - std::vector io_streams() { return io_stream_; } + const std::vector& exec_streams() { return exec_stream_; } + const std::vector& io_streams() { return io_stream_; } int sm_version() { return sm_version_; } bool has_fp16() { return has_fp16_; } diff --git a/lite/core/kernel.cc b/lite/core/kernel.cc index 7ec718cb3881c10dec08376419b419777c71bba6..194d736a4c0cf6fa18eae119589c5fa1fd08bca0 100644 --- a/lite/core/kernel.cc +++ b/lite/core/kernel.cc @@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, std::string *alias, Place *place) { auto parts = Split(kernel_type, "/"); - CHECK_EQ(parts.size(), 5); + 
CHECK_EQ(parts.size(), 5u); *op_type = parts[0]; *alias = parts[1]; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 91accc907ed16b2de64e5982b88d38029fd2902b..d036bf7988b98e64586e42683d33b4696e9ff706 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -37,6 +37,7 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + multi_stream_analysis_pass.cc mlu_postprocess_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 150a6e68d8a924ebfa96fdffb99e28b230689a48..143a7cecce8c1c45ada9ad31e8e4bea5447fec68 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -164,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; - for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { - for (unsigned int i = 0; i < h; ++i) { + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { weight_scale[i] *= fabsf(alpha_data[i]); if (alpha_data[i] < 0.f) { auto ptr_row = conv_weight_d + k * c_size + i * hw; - for (unsigned int j = 0; j < hw; ++j) { + for (int j = 0; j < hw; ++j) { ptr_row[j] *= -1; } } } } } else { - for (unsigned int i = 0; i < h; ++i) { + for (int i = 0; i < h; ++i) { weight_scale[i] *= fabsf(alpha_data[i]); if (alpha_data[i] < 0.f) { auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { + for (int j = 0; j < w; ++j) { ptr_row[j] *= -1; } } @@ -204,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; - for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { - for (unsigned int i = 0; i < h; ++i) { + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { auto ptr_row = conv_weight_d + k * c_size + i * hw; - for (unsigned int j = 0; j < hw; ++j) { + for (int j = 0; j < hw; ++j) { ptr_row[j] *= alpha_data[i]; } } } } else { - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels + for (int i = 0; i < h; ++i) { // n: conv2d output channels + for (int j = 0; j < w; ++j) { // w: conv2d input channels conv_weight_d[i * w + j] *= alpha_data[i]; } } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index a3a98b871fb4b6f8230299cda978b0f1f8faa779..2c7cc2fe5547d6004ded99f28698478cec0a3639 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -260,7 +260,7 @@ void 
ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto channel_scale_tensor = scope->FindVar(channel_scale_name)->GetMutable(); auto* channel_scale_data = channel_scale_tensor->data(); - for (int i = 0; i < channel_scale_tensor->data_size(); i++) { + for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) { weight_scale.push_back(channel_scale_data[i] / range); } diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 76c97d2da6ed9e7c6fc1f1889d80095278b68ec0..d7486c0933dbbe74115bd6358962817b2b946c12 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -14,6 +14,7 @@ #include "lite/core/mir/generate_program_pass.h" #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -25,10 +26,37 @@ namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "final program \n" << Visualize(graph.get()); - for (auto& item : graph->StmtTopologicalOrder()) { + std::vector nodes_in_order; +#ifdef LITE_WITH_CUDA + const std::string depend_pass = "multi_stream_analysis_pass"; + const std::string attr_name = "nodes_in_order"; + mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass); + if (pass->HasAttr(attr_name)) { + nodes_in_order = pass->GetAttr>(attr_name); + } +#endif + if (nodes_in_order.empty()) { + nodes_in_order = graph->StmtTopologicalOrder(); + } + + for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; +#ifdef LITE_WITH_CUDA + if (stmt.kernels().front()->target() == TargetType::kCUDA) { + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetNeedSync(stmt.need_sync_); + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetSyncStreams(stmt.sync_streams_); + } +#endif insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index a32c9c05f69e5c31b77bc0d2ff976560f29b9bec..55b7a004567ec5a5298e084839d6dcf5a8591882 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -85,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { if (!node->IsStmt()) continue; auto op_info = node->AsStmt().op_info(); auto op_type = op_info->Type(); - std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + std::string op_name; + if (node->AsStmt().need_sync_) { + std::ostringstream oss; + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + oss << std::to_string(node->AsStmt().sync_streams_[i]); + if (i != node->AsStmt().sync_streams_.size() - 1) { + oss << ","; + } + } + op_name = string_format("%s%d, stream=%d, sync_streams={%s}", + op_type.c_str(), + op_idx++, + node->AsStmt().stream_id_, + oss.str().c_str()); + } else { + op_name = string_format("%s%d", op_type.c_str(), op_idx++); + } // Add its input&output variables as the Dot nodes dot.AddNode(op_name, {Dot::Attr("shape", "box"), @@ -93,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) { Dot::Attr("color", "black"), Dot::Attr("fillcolor", "yellow")}); for (auto& x : node->inlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); @@ -101,7 +123,13 @@ std::string 
Visualize(mir::SSAGraph* graph) { dot.AddEdge(var_name, op_name, {}); } for (auto& x : node->outlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 38293ede76ed35bf05767ce1333947b7dfdbc4ac..12b4eab0a9582af6d2d4abd3941e75b99a3e39a6 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) - .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .ExcludeTargets({TARGET(kNPU), + TARGET(kXPU), + TARGET(kBM), + TARGET(kRKNPU), + TARGET(kAPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index 15f62f36b0f026dc42ecbb274c946e294c7fc44e..ba48d5d4ead5ea922ded0bff3a87c2c127595790 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, // get subgraph op's type info size_t kernel_size = inst_node->AsStmt().kernels().size(); - CHECK_GT(kernel_size, 0); + CHECK_GT(kernel_size, 0u); VLOG(4) << "subgraph kernel size: " << kernel_size; for (size_t i = 0; i < kernel_size; ++i) { @@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { auto* block_desc = static_cast(inst->AsStmt().op().get()) ->GetSubBlock(); - for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { auto op_desc = block_desc->GetOp(op_idx); CHECK(op_desc); if (op_desc->Type() == "conv2d") { diff --git a/lite/core/mir/multi_stream_analysis_pass.cc b/lite/core/mir/multi_stream_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..46454a1fc357c7d96162a58a43a6c34bc890bc69 --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
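The hand-off shown above between the two passes relies on the new pass attribute mechanism: multi_stream_analysis_pass stores the reordered node list under "nodes_in_order", and generate_program_pass reads it back when present. A self-contained sketch of that pattern, using C++17 std::any in place of the project's variant utility and plain ints in place of Node* (both substitutions are assumptions made for brevity; ToyPass and main are illustrative names):

    #include <any>
    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Minimal stand-in for the Pass attribute store: SetAttr copies the value,
    // GetAttr returns a typed reference, HasAttr guards the lookup.
    class ToyPass {
     public:
      bool HasAttr(const std::string& name) const { return attrs_.count(name) > 0; }

      template <typename T>
      void SetAttr(const std::string& name, const T* value) {
        attrs_[name] = *value;  // the real Pass stores into a variant utility
      }

      template <typename T>
      const T& GetAttr(const std::string& name) const {
        return std::any_cast<const T&>(attrs_.at(name));
      }

     private:
      std::unordered_map<std::string, std::any> attrs_;
    };

    int main() {
      ToyPass analysis;                    // plays multi_stream_analysis_pass
      std::vector<int> order = {2, 0, 1};  // plays the reordered Node* list
      analysis.SetAttr("nodes_in_order", &order);

      // plays generate_program_pass reading the order back
      if (analysis.HasAttr("nodes_in_order")) {
        const auto& got = analysis.GetAttr<std::vector<int>>("nodes_in_order");
        assert(got.size() == 3);
      }
      return 0;
    }
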
+ +#include "lite/core/mir/multi_stream_analysis_pass.h" + +#include +#include +#include +#include + +#include "lite/core/device_info.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace mir { + +void MultiStreamAnalysisPass::CleanUp() { + exec_ops_.clear(); + wait_que_.clear(); + wait_que_cpu_.clear(); + std::queue empty_queue; + while (!exec_que_.empty()) { + exec_que_.pop(); + } + ops_in_streams_.clear(); + resources_.clear(); + map_arg_to_lane_.clear(); + op_types_set_.clear(); + io_copy_once_num_ = 0; +} + +void MultiStreamAnalysisPass::Init(SSAGraph* graph) { + // If not cleaned, the clone will overlay the previous state + CleanUp(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->IsStmt()) { + // Set all outputs of op to inaccessible state. + auto outputs = op_node->outlinks; + for (Node* node : outputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (!resources_.count(arg.name)) { + resources_[arg.name] = false; + } + } + // Set the weight input of op to be accessible. + auto inputs = op_node->inlinks; + for (Node* node : inputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (arg.is_weight || arg.is_persist) { + resources_[arg.name] = true; + } + } + + // feed and io_copy_once op has no dependencies and can be launched + // directly. Other ops are put into the waiting queue. + if (op_node->AsStmt().op_type() == "feed" || + op_node->AsStmt().op_type() == "io_copy_once") { + exec_que_.push(op_node); + } else { + auto tgt = op_node->AsStmt().kernels().front()->target(); + if (tgt == TargetType::kCUDA) { + wait_que_.push_back(op_node); + } else { + wait_que_cpu_.push_back(op_node); + } + } + op_types_set_.insert(op_node->AsStmt().op_type()); + } + } + + // Set the stream id according to the number of feed ops, and set the output + // of the feed op to be accessible. + int lane = 0; + auto nodes = graph->inputs(); + ops_in_streams_.resize(max_stream_); + + for (auto& node : nodes) { + std::string::size_type idx = node->AsArg().name.find("feed"); + if (idx != std::string::npos) { + for (auto& feed_ops : node->outlinks) { + if (feed_ops->AsStmt().op_type() == "feed") { + // feed op doesn't need to wait sync. 
+ feed_ops->AsStmt().need_sync_ = false; + CHECK_EQ(static_cast(feed_ops->outlinks.size()), 1) + << "feed op must have one output."; + for (auto& var : feed_ops->outlinks) { + var->AsArg().lane = lane; + map_arg_to_lane_[var->AsArg().name] = lane; + resources_[var->AsArg().name] = true; + } + feed_ops->AsStmt().stream_id_ = lane; + ops_in_streams_[lane].push_back(feed_ops); + ++lane; + if (lane >= max_stream_) { + lane = 0; + } + } + } + } + // set all io_copy_once op in the first stream + for (auto& io_copy_once_ops : node->outlinks) { + if (io_copy_once_ops->AsStmt().op_type() == "io_copy_once") { + ops_in_streams_[0].push_back(io_copy_once_ops); + io_copy_once_ops->AsStmt().stream_id_ = 0; + io_copy_once_ops->AsStmt().need_sync_ = false; + ++io_copy_once_num_; + } + } + } +} + +bool MultiStreamAnalysisPass::CheckOpSupport() { + std::unordered_set invalid_op = { + "while", "conditional_block", "conditional_block_infer", "graph_op"}; + for (auto& op_type : op_types_set_) { + if (invalid_op.count(op_type)) { + LOG(INFO) << "multi_stream_analysis_pass don't support " << op_type + << ", just return."; + return false; + } + } + return true; +} + +bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) { + // feed op are prepared when init. + std::string op_name = stmt_node->AsStmt().op_type(); + if (op_name == "feed") { + return true; + } + + // Check is op's input are all accessible. + std::vector args; + for (auto* ins : stmt_node->inlinks) { + args.push_back(ins->AsArg().name); + } + return CheckAccess(args); +} + +bool MultiStreamAnalysisPass::CheckAccess( + const std::vector& args) { + if (args.size() == 0) { + return true; + } + for (auto& name : args) { + if (resources_[name]) { + continue; + } else { + return false; + } + } + return true; +} + +int MultiStreamAnalysisPass::SelectStreamId(const std::vector& lanes) { + if (lanes.size() == 0) { + return 0; + } + + int res = lanes[0]; + int exclude_io_copy_once_num = ops_in_streams_[0].size() - io_copy_once_num_; + int min_num = lanes[0] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[0]].size(); + for (size_t i = 1; i < lanes.size(); ++i) { + int ith_num = lanes[i] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[i]].size(); + if (ith_num < min_num) { + res = lanes[i]; + min_num = ith_num; + } + } + + return res; +} + +void MultiStreamAnalysisPass::Launch(Node* stmt_node) { + // record ops launch order. + exec_que_.push(stmt_node); + std::vector lanes; + for (auto& in_arg : stmt_node->inlinks) { + // Weight parameter does not involve stream id, so just skip it. + if (in_arg->AsArg().is_weight || in_arg->AsArg().is_persist) { + continue; + } + + if (std::find(lanes.begin(), lanes.end(), in_arg->AsArg().lane) == + lanes.end()) { + lanes.push_back(in_arg->AsArg().lane); + } + } + + int stream_id = SelectStreamId(lanes); + + // If all inputs of the op are on multiple streams, they need to be + // synchronized + if (lanes.size() > 1) { + for (size_t i = 0; i < lanes.size(); ++i) { + if (lanes[i] != stream_id) { + stmt_node->AsStmt().sync_streams_.push_back(lanes[i]); + } + } + stmt_node->AsStmt().need_sync_ = true; + } + // io_copy are nodes inserted across devices and need to be synced. + if (stmt_node->AsStmt().op_type() == "io_copy") { + stmt_node->AsStmt().need_sync_ = true; + } + stmt_node->AsStmt().stream_id_ = stream_id; + + // set output lane and set the output of op to be accessible. 
+ for (auto& out_arg : stmt_node->outlinks) { + out_arg->AsArg().lane = stream_id; + resources_[out_arg->AsArg().name] = true; + } + ops_in_streams_[stream_id].push_back(stmt_node); +} + +void MultiStreamAnalysisPass::Apply(const std::unique_ptr& graph) { +#ifdef LITE_WITH_CUDA + typename Env::Devs& devs = + Env::Global(); + int dev_id = TargetWrapper::GetCurDevice(); + max_stream_ = devs[dev_id].max_stream(); +#else + LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON"; +#endif + + // Find the correct startup sequence for op. + Init(graph.get()); + bool is_valid = CheckOpSupport(); + if (!is_valid) { + return; + } + size_t prev_size; + + while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) { + prev_size = this->wait_que_.size() + this->wait_que_cpu_.size(); + // launch the acessible cuda kernel and remove it from wait que. + for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) { + if (IsPrepared(*it)) { + Launch(*it); + it = wait_que_.erase(it); + } else { + ++it; + } + } + // launch the accessible cpu kernel and remove it from wait que. + for (auto cpu_it = this->wait_que_cpu_.begin(); + cpu_it != this->wait_que_cpu_.end();) { + if (IsPrepared(*cpu_it)) { + Launch(*cpu_it); + cpu_it = wait_que_cpu_.erase(cpu_it); + } else { + ++cpu_it; + } + } + + if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) { + LOG(FATAL) << "network topo error!"; + } + } + + // Get exec ops order. + while (!exec_que_.empty()) { + auto* node = exec_que_.front(); + exec_ops_.push_back(node); + VLOG(4) << node->AsStmt().op_type() + << " stream: " << node->AsStmt().stream_id_ + << ", sync: " << node->AsStmt().need_sync_; + if (node->AsStmt().need_sync_) { + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + VLOG(4) << " " << node->AsStmt().sync_streams_[i]; + } + } + exec_que_.pop(); + } + + // Set attribute parameters, for passing parameters between passes + const std::string attr_name{"nodes_in_order"}; + SetAttr>(attr_name, &exec_ops_); + + LOG(INFO) << "stream " << 0 << " has " + << ops_in_streams_[0].size() - io_copy_once_num_ + << " ops. (exclude io_copy_once)."; + for (size_t i = 1; i < ops_in_streams_.size(); ++i) { + LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size() + << " ops."; + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(multi_stream_analysis_pass, + paddle::lite::mir::MultiStreamAnalysisPass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/multi_stream_analysis_pass.h b/lite/core/mir/multi_stream_analysis_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..37a7feca3a1200ad7ff26ef8fc0317deee9d174e --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "lite/core/kernel.h"
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+/*
+ * MultiStreamAnalysisPass will find the correct launch sequence for all ops.
+ * Ideally, the order should be mostly asynchronous ops and a small number of
+ * synchronous ops.
+ */
+class MultiStreamAnalysisPass : public StmtPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+
+ private:
+  // Init the resource list. Set all ops except feed to the inaccessible state
+  // and set the stream id according to the number of inputs.
+  void Init(SSAGraph* graph);
+
+  // Clean state information of all member variables.
+  void CleanUp();
+
+  // After launching, unlock the output resources of op.
+  void Launch(Node* stmt_node);
+
+  // If all inputs of an op are accessible, the op is considered to be in the
+  // prepared state.
+  bool IsPrepared(Node* stmt_node);
+
+  // Determine if all inputs of op are accessible.
+  bool CheckAccess(const std::vector<std::string>& args);
+
+  // The logic of selecting a stream:
+  // 1. Make the number of ops on each stream as close as possible.
+  // 2. The selected stream must be one of the streams contained in the input
+  //    args.
+  int SelectStreamId(const std::vector<int>& lanes);
+
+  // Check if the model's ops are all supported. If an unsupported op is
+  // encountered, exit.
+  bool CheckOpSupport();
+
+ private:
+  std::list<Node*> wait_que_;
+  std::list<Node*> wait_que_cpu_;
+  std::queue<Node*> exec_que_;
+  std::vector<Node*> exec_ops_;
+  std::vector<std::vector<Node*>> ops_in_streams_;
+  std::unordered_map<std::string, bool> resources_;
+  std::unordered_map<std::string, int> map_arg_to_lane_;
+  int max_stream_;
+  int io_copy_once_num_;
+  std::unordered_set<std::string> op_types_set_;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
index 45b15812fadb0789edea3f89fb00b4612bdb010f..ae7b112d9157de3f53c409dfc89bf1273531e05f 100644
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
@@ -80,6 +80,12 @@ class Node {
 
     // Description.
     std::string desc;
+
+    // for cuda multi stream
+    bool need_sync_{false};
+    int stream_id_{0};
+    // streams which need to be synced; excludes stream_id_
+    std::vector<int> sync_streams_{};
   };
 
   struct Arg {
@@ -93,6 +99,7 @@ class Node {
     // if the need more than one tool operator(eg. io_copy layout calib), the
     // argument between them should be persist to make sure it's only run once
     bool is_persist{false};
+    int lane{-1};
   };
 
   Arg& AsArg(const std::string& name, int id);
diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h
index 4e8c8be292bbd5e7f46664378634d4f1aeed2965..64f2db82c0b1b0b863c1aa61b3b2affea5f85d89 100644
--- a/lite/core/mir/pass.h
+++ b/lite/core/mir/pass.h
@@ -17,9 +17,11 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <unordered_map>
 
 #include "lite/core/mir/node.h"
 #include "lite/core/mir/ssa_graph.h"
+#include "lite/utils/varient.h"
 
 namespace paddle {
 namespace lite {
@@ -121,6 +123,27 @@ class Pass {
 
   virtual ~Pass() = default;
 
+  bool HasAttr(const std::string& attr_name) const {
+    return pass_attrs_.count(attr_name) > 0;
+  }
+
+  // Set a pointer to the attribute. The specific pass itself takes ownership
+  // of the attribute.
+  template <typename AttrType>
+  void SetAttr(const std::string& attr_name, const AttrType* attr) {
+    VLOG(4) << "Setting the attribute " << attr_name << " for the pass "
+            << name_;
+    pass_attrs_[attr_name].set(*attr);
+  }
+
+  // Get a reference to the attribute previously set.
+ template + const AttrType& GetAttr(const std::string& attr_name) const { + CHECK(pass_attrs_.count(attr_name)) + << attr_name << " attr not register for pass " << name_; + return pass_attrs_.at(attr_name).get(); + } + private: const Kind kind_; std::string name_; @@ -128,6 +151,8 @@ class Pass { std::set bound_targets_; std::set excluded_targets_; std::unordered_map> bound_kernels_; + std::unordered_map>> + pass_attrs_; }; // Different kinds. diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h index 849f80aea2191b72ac423c7125a4e69cb6927be5..170de1cd31ffd31662eb98898ad795993a36289e 100644 --- a/lite/core/mir/pass_registry.h +++ b/lite/core/mir/pass_registry.h @@ -59,6 +59,9 @@ class PassRegistry { } // namespace lite } // namespace paddle +// some platform-independent defintion +#include "lite/utils/macros.h" + #define REGISTER_MIR_PASS(name__, class__) \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ new class__); \ @@ -66,4 +69,4 @@ class PassRegistry { return mir_pass_registry##name__.Touch(); \ } \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ - __attribute__((unused)) = mir_pass_registry##name__ + UNUSED = mir_pass_registry##name__ diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 40cad8f6af75300ab85753b16e391daeeadc6c2f..187e6b634fcf9d38cb32b7ca936ac8039c1717cf 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, paddle::lite::mir::QuantizedOpAttributesInferencePass) - .BindTargets({TARGET(kNPU)}); + .BindTargets({TARGET(kAPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 3cbe602f31a87c6ddb42d36fe75e52e8347695d8..5b6f968484b7b49838a004c3edfd00ff9b7e5e5e 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -45,9 +45,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().target())); } #else - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + int stream_id = inst.stream_id_; + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), stream_id)); #endif } } diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index b61f7f365f51a32e267dd12943be5fcfadb3e08a..6bab454c42a68a7513aa01ff06cc2be6c970e199 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { "turquoise4", "snow3", "sienna4", "salmon2", }; std::unordered_map subgraph_indices; - for (int i = 0; i < subgraphs_.size(); i++) { - for (int j = 0; j < subgraphs_[i].size(); j++) { + for (size_t i = 0; i < subgraphs_.size(); i++) { + for (size_t j = 0; j < subgraphs_[i].size(); j++) { subgraph_indices[subgraphs_[i][j]] = i; } } @@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, std::vector> subgraphs = SubgraphDetector(graph, teller)(); SubgraphVisualizer(graph, subgraphs)(); - for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { + for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size(); + 
subgraph_idx++) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 1e54e1497b5d49754a705340aafa30ded1c2a727..f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -36,8 +36,8 @@ std::vector AddFCDesc( const std::shared_ptr& scope, const std::vector& input_var_names, const std::vector& wshape) { - CHECK_EQ(input_var_names.size(), 1); - CHECK_EQ(wshape.size(), 2); + CHECK_EQ(input_var_names.size(), 1u); + CHECK_EQ(wshape.size(), 2u); static int id = 0; std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); @@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); - ASSERT_EQ(graph->nodes().size(), 9); + ASSERT_EQ(subgraphs.size(), 1u); + ASSERT_EQ(graph->nodes().size(), 9u); mir::SubgraphVisualizer(graph.get(), subgraphs)(); } @@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphVisualizer(graph.get(), subgraphs)(); - ASSERT_EQ(subgraphs.size(), 1); + ASSERT_EQ(subgraphs.size(), 1u); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index eecd9348ae684929d3f55dee2a94921a078f148c..663b69d38843555095957f30d652ba8ef6216a0e 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void APUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) \ + supported_lists.insert(#op_type); \ + LOG(INFO) << #op_type +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; @@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); @@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) + 
.BindTargets({TARGET(kAPU)}); REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index f83448df42ffe6d6d8c5b37503b5127290037dce..8c2b501a62356c91e93f3c4ca91f70879d3c9229 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class APUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class XPUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class MLUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index a2369adc5d882310503cbf52fa5394098d824b40..c638793c08160eb8ee7edabeab0977541e85d82a 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -39,7 +39,7 @@ std::vector> ShapeParsing(std::string text) { std::vector> shapes; std::vector shape_strings = Split(text, ":"); shapes.resize(shape_strings.size()); - for (int i = 0; i < shape_strings.size(); i++) { + for (size_t i = 0; i < shape_strings.size(); i++) { std::vector shape_nums = Split(shape_strings[i], ","); for (auto shape_num : shape_nums) { shapes[i].push_back(atoi(shape_num.c_str())); @@ -66,7 +66,7 @@ void FillInputTensors( for (int j = 0; j < input_tensor_size; j++) { \ input_tensor_data[j] = static_cast(value); \ } - for (int i = 0; i < input_tensor_shape.size(); i++) { + for (size_t i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); input_tensor->Resize(input_tensor_shape[i]); auto input_tensor_size = ShapeProduction(input_tensor->shape()); @@ -95,7 +95,7 @@ void CheckOutputTensors( << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ EXPECT_LT(rel_diff, 0.1); \ } - for (int i = 0; i < output_tensor_type.size(); i++) { + for (size_t i = 0; i < output_tensor_type.size(); i++) { auto tar_output_tensor = tar_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index ecccf89fa76287a3f30756f7138fcce229e8f337..121e64dc188eeb638becec3506b514bc24dad16d 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, auto input_or_output_scales = op_info->GetAttr>(attr_name); auto size = input_or_output_names.size(); CHECK(size == input_or_output_scales.size()); - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { if 
(input_or_output_names[i] == var_name) { *scale = input_or_output_scales[i]; return true; @@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { nodes.push_back(node); } + // record the copied node. + std::unordered_map cast_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &cast_nodes); } } } -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void PrecisionCastPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes) { // If this input is out of date. if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, in, graph, inst_node, + cast_nodes, graph->valid_places()); } } -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { +void PrecisionCastPass::AddCastInst( + const Type& from, + const Type& to, + Node* in, + SSAGraph* graph, + Node* inst_node, + std::unordered_map* cast_nodes, + const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst @@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, auto cast_op_output_name = in->AsArg().name + "/precision_trans"; // in->AsArg().name + "/precision_trans/" + // paddle::lite::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - cast_op_output_arg->AsArg().type = - LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); - auto* cast_inst = graph->NewInstructNode(); + if (cast_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + // Update the original instruction OpDesc. + // Update its input to the cast_op_output_name + // Add new link, newarg->inst + DirectedLink(cast_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } else { + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + cast_op_output_arg->AsArg().type = + LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); + auto* cast_inst = graph->NewInstructNode(); - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; + // create Op and kernels. + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string cast_type = in_persist ? "calib_once" : "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; + auto cast_op = LiteOpRegistry::Global().Create(cast_type); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + // Create the new var manually. 
+ inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - float scale; - if (InferScale(in, inst_node, &scale)) { - op_desc.SetAttr("scale", scale); - } + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(cast_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); + } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->precision() == to.precision()) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); + (*cast_nodes)[in->AsArg().name] = cast_op_output_arg; + break; + } } - } - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); + // Remove the old link + RemoveDirectedLink(in, inst_node); - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name + // Update the original instruction OpDesc. 
+ // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); - // reset opdesc and update kernel information - UpdateInputs( - inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } // recreate the op auto original_selected_kernel = diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index b5f7c5d902a998e369f0b1775c59f50cbf8dc256..d8d6af5fcd06c187029c7c16a74efade0d4bd5ca 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes); void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* cast_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older model, + // save this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the + * list in the quantized ops. 
+*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. */ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index f8a706179374a0c86e28cf9a3638f5df2c932540..941a9e9f88cf04ef47487237b1a3f6509dea762b 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -41,7 +41,7 @@ bool OpLite::InferShapeWithCache() { iter++) { // combined dims value into new_hash value. auto &element_dims = (*iter)->dims(); - for (int i = 0; i < element_dims.size(); i++) { + for (size_t i = 0; i < element_dims.size(); i++) { new_hash = lite::hash_combine(new_hash, static_cast(element_dims[i])); } @@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() { auto &emement_lods = (*iter)->lod(); for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); lod_iter++) { - for (int i = 0; i < lod_iter->size(); i++) { + for (size_t i = 0; i < lod_iter->size(); i++) { new_hash = lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); } @@ -60,7 +60,7 @@ bool OpLite::InferShapeWithCache() { // if current hash value is consistent with io_shape_lod_hash_, // previous outputs shape and lod are reused. 
auto *current_outputs = param_.output_tensor_ptrs(); - for (int i = 0; i < current_outputs->size(); i++) { + for (size_t i = 0; i < current_outputs->size(); i++) { current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->set_lod(last_output_lods[i]); } @@ -69,7 +69,7 @@ bool OpLite::InferShapeWithCache() { io_shape_lod_hash_ = new_hash; this->InferShapeImpl(); auto *current_outputs = param_.output_tensor_ptrs(); - for (int i = 0; i < current_outputs->size(); i++) { + for (size_t i = 0; i < current_outputs->size(); i++) { last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_lods[i] = current_outputs->at(i)->lod(); } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 84f54b57b86c012ac72e367d657263b156e6c301..29c853c70caa80add9d47182da228a36f031cb42 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -98,6 +98,9 @@ std::list> KernelRegistry::Create( case TARGET(kNPU): { CREATE_KERNEL(kNPU); } break; + case TARGET(kAPU): { + CREATE_KERNEL(kAPU); + } break; case TARGET(kXPU): { CREATE_KERNEL(kXPU); } break; @@ -110,6 +113,9 @@ std::list> KernelRegistry::Create( case TARGET(kMLU): { CREATE_KERNEL(kMLU); } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -151,16 +157,30 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNHWC); INIT_FOR(kMLU, kInt16, kNCHW); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt64, kNCHW); INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); INIT_FOR(kHost, kAny, kNHWC); INIT_FOR(kHost, kAny, kAny); + INIT_FOR(kHost, kBool, kNCHW); + INIT_FOR(kHost, kBool, kNHWC); + INIT_FOR(kHost, kBool, kAny); + INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kFloat, kNHWC); + INIT_FOR(kHost, kFloat, kAny); + INIT_FOR(kHost, kFP16, kNCHW); + INIT_FOR(kHost, kFP16, kNHWC); + INIT_FOR(kHost, kFP16, kAny); + INIT_FOR(kHost, kInt8, kNCHW); + INIT_FOR(kHost, kInt8, kNHWC); + INIT_FOR(kHost, kInt8, kAny); + INIT_FOR(kHost, kInt16, kNCHW); + INIT_FOR(kHost, kInt16, kNHWC); + INIT_FOR(kHost, kInt16, kAny); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt32, kNHWC); + INIT_FOR(kHost, kInt32, kAny); + INIT_FOR(kHost, kInt64, kNCHW); + INIT_FOR(kHost, kInt64, kNHWC); + INIT_FOR(kHost, kInt64, kAny); INIT_FOR(kX86, kFloat, kNCHW); INIT_FOR(kX86, kAny, kNCHW); @@ -203,6 +223,7 @@ KernelRegistry::KernelRegistry() INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kAny); + INIT_FOR(kAPU, kInt8, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kAny, kNCHW); @@ -218,6 +239,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 96c9fc2358199594cf9590385c2efdaf1c671425..7d73155ac067da4bfd112661d9061c008c1ccef1 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -231,6 +231,9 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -251,6 +254,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + 
KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -435,32 +448,31 @@ class KernelRegistor : public lite::Registor { #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - OpKernelInfoCollector::Global().AddKernel2path( \ - #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ - __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) +#define REGISTER_LITE_KERNEL( \ + op_type__, target__, precision__, layout__, KernelClass, alias__) \ + static paddle::lite::KernelRegistor \ + LITE_KERNEL_REGISTER_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__)(#op_type__, \ + #alias__); \ + static KernelClass LITE_KERNEL_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__); \ + int touch_##op_type__##target__##precision__##layout__##alias__() { \ + OpKernelInfoCollector::Global().AddKernel2path( \ + #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ + __FILE__); \ + LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ + .Touch(); \ + return 0; \ + } \ + static bool LITE_KERNEL_PARAM_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__) UNUSED = \ + paddle::lite::ParamTypeRegistry::NewInstance( \ + #op_type__ "/" #alias__) #define LITE_KERNEL_INSTANCE( \ op_type__, target__, precision__, layout__, alias__) \ diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 80c2bd553f6b8073e55d28ef0115246266a6a1c9..83df76f0230f666ec3857834e234afd921daa927 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -86,6 +86,8 @@ class Optimizer { "npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", + "apu_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -127,7 +129,21 @@ class Optimizer { "memory_optimize_pass"}}; if (passes.size() == 1) { - passes_local.push_back(passes[0]); + // multi_stream_analysis_pass must be in the front of + // runtime_context_assign_pass + const std::string msa_pass{"multi_stream_analysis_pass"}; + const std::string depend_pass{"runtime_context_assign_pass"}; + if (passes[0] == msa_pass) { + auto iter = + std::find(passes_local.begin(), passes_local.end(), depend_pass); + if (iter != passes_local.end()) { + passes_local.insert(iter, msa_pass); + } else { + CHECK(false) << "Not find " << depend_pass; + } + } else { + passes_local.push_back(passes[0]); + } } RunPasses(passes_local); } else { diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index 
e72d1f54ee858ef10de83ceefb49addae6ea6606..ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -178,6 +178,13 @@ class PrecisionProfiler { write_result_to_file&& write_tensorfile(in, name); return; } + case PRECISION(kInt64): { + auto ptr = in->data(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + return; + } default: *mean = -333333333333; *std_dev = -33333333333; diff --git a/lite/core/program.cc b/lite/core/program.cc index ff900c0e23be9a06313babba51e3ce364295231a..5ddf6c0e935a851cc0b3c3eb7554609939ef1cbf 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { std::unordered_map origin_var_maps; auto& main_block = *desc->GetBlock(0); auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { + for (size_t i = 0; i < var_size; i++) { auto v = main_block.GetVar(i); auto name = v->Name(); origin_var_maps.emplace(name, *v); @@ -145,6 +145,11 @@ void RuntimeProgram::Run() { for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; +#endif +#ifdef LITE_WITH_CUDA + if (inst.need_sync()) { + inst.Sync(); + } #endif inst.Run(); #ifdef LITE_WITH_PRECISION_PROFILE diff --git a/lite/core/program.h b/lite/core/program.h index c845a17c52c0c565e339a13e093f3e8f59e8d4a7..9d5fef7c0367d0e0fabf6ecff8b22e5e20a7bb57 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -108,6 +108,18 @@ struct Instruction { bool is_feed_fetch_op() const { return is_feed_fetch_op_; } +#ifdef LITE_WITH_CUDA + bool need_sync() const { + if (kernel_->target() == TargetType::kCUDA) { + return kernel_->mutable_context()->As().need_sync(); + } else { + // the io_copy kernel has synced, so cpu kernels don't need to sync.
+ return false; + } + } + void Sync() const { kernel_->mutable_context()->As().Sync(); } +#endif + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index ecb9935dfd13c09cbd1a20f3833e6ab76161192a..1ae291dd40d19940e93bfda9b0c22f4092ce7988 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { void TensorLite::ResetBuffer(std::shared_ptr buffer, size_t memory_size) { - CHECK_EQ(offset_, 0) + CHECK_EQ(offset_, 0u) << "Only the offset is supported to zero when the Buffer is reset."; if (buffer_) { CHECK_LE(memory_size_, buffer->space()) diff --git a/lite/core/types.cc b/lite/core/types.cc index 4ea383333d519ac2c481dce459ca49124a64df32..a19c5ed0a33986237ce03213875929d34a2fb363 100644 --- a/lite/core/types.cc +++ b/lite/core/types.cc @@ -67,31 +67,31 @@ STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { template <> Type StdTypeToRepr() { - return Type::_int32; + return Type::INT32; } template <> Type StdTypeToRepr() { - return Type::_int64; + return Type::INT64; } template <> Type StdTypeToRepr() { - return Type::_float32; + return Type::FLOAT32; } template <> Type StdTypeToRepr() { - return Type::_float64; + return Type::Float64; } template <> Type StdTypeToRepr>() { - return Type::_char_list; + return Type::CHARLIST; } template <> Type StdTypeToRepr() { - return Type::_string; + return Type::STRING; } template <> Type StdTypeToRepr() { - return Type::_bool; + return Type::BOOL; } } // namespace core diff --git a/lite/core/types.h b/lite/core/types.h index 8f154f9dd509d3627750ecbf301923a2296252d1..66dc44746a7496d9805e8cc2b6bf2df89b33ddbf 100644 --- a/lite/core/types.h +++ b/lite/core/types.h @@ -29,23 +29,23 @@ namespace core { */ // TODO(Superjomn) unify all the type representation across the lite framework. 
enum class Type { - _unk = -1, - // primary types - _int32, - _int64, - _float32, - _float64, - _bool, - _string, + UNK = -1, + // primary types + INT32, + INT64, + FLOAT32, + Float64, + BOOL, + STRING, // primary list type - _char_list, + CHARLIST, // list types - _list, + LIST, // enum type - _enum, - _float16, + ENUM, + FLOAT16, // number of types - __num__, + NUM, }; enum class FluidType { @@ -81,7 +81,7 @@ enum class FluidType { template Type StdTypeToRepr() { - return Type::_unk; + return Type::UNK; } template <> Type StdTypeToRepr(); @@ -92,6 +92,8 @@ Type StdTypeToRepr(); template <> Type StdTypeToRepr(); template <> +Type StdTypeToRepr(); +template <> Type StdTypeToRepr>(); template <> Type StdTypeToRepr(); diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 0c9da1a76422edae45dfeec5d38556a5e2322a85..2a819883fa316bd1898c063912800b57804218db 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -18,6 +18,11 @@ #include "paddle_api.h" // NOLINT #include "paddle_use_passes.h" // NOLINT +#if defined(_WIN32) +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#endif + using namespace paddle::lite_api; // NOLINT DEFINE_string(model_dir, "", "Model dir path."); diff --git a/lite/demo/python/mobilenetv1_full_api.py b/lite/demo/python/mobilenetv1_full_api.py index a31469e3e8da81f3753dc5d241d4ef39ac03832f..c3a6bd077be5978f1ecaf9b040b119e50117d62b 100644 --- a/lite/demo/python/mobilenetv1_full_api.py +++ b/lite/demo/python/mobilenetv1_full_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/demo/python/mobilenetv1_light_api.py b/lite/demo/python/mobilenetv1_light_api.py index a44427092bae88aa41b3b1d0684cfcf36835b3d2..5847c7819366b654dd9d5b5cbe2108b54da7b04c 100644 --- a/lite/demo/python/mobilenetv1_light_api.py +++ b/lite/demo/python/mobilenetv1_light_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index d33a77c4bfcefbc349d453de05dcbb7c27707a19..9c96459993e55b441ea795c4f2cb58f40846c0d9 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License.
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "lite/fluid/data_type.h" #include #include diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 78bb8d10b798b73861ddbf25e427289fc2984a55..17a836b17183d69b0e2a15b46b7a2097c323312f 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -11,4 +11,6 @@ add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) +add_subdirectory(apu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/apu/CMakeLists.txt b/lite/kernels/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..25182e2e20f9204e4dfd62b72c650ac0b07f3318 --- /dev/null +++ b/lite/kernels/apu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges}) diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ac4670f04e0fc7711a898476c1f9bd0c016127c --- /dev/null +++ b/lite/kernels/apu/bridges/CMakeLists.txt @@ -0,0 +1,30 @@ +if(NOT LITE_WITH_APU) + return() +endif() + + +lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor) +lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu) + +set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu) + +lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps}) + + +set(apu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_utility_apu + subgraph_bridge_conv_op_apu + subgraph_bridge_elementwise_ops_apu + subgraph_bridge_act_op_apu + subgraph_bridge_softmax_op_apu + subgraph_bridge_fc_op_apu + subgraph_bridge_pool_op_apu + CACHE INTERNAL "apu_subgraph_bridges") + +message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}") diff --git a/lite/kernels/apu/bridges/act_op.cc b/lite/kernels/apu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2451d640eb52f6da88c4cd91bbf4ccd95f49152 --- /dev/null +++ 
b/lite/kernels/apu/bridges/act_op.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + return SUCCESS; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter); diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..859ad777ae58c3be0f36290adb47356f90c795ce --- /dev/null +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -0,0 +1,565 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + + VLOG(3) << "[APU] Converting [" << op_type << "]"; + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + LOAD_FUNCTIONS(libHandle, + NeuronModel_setOperandSymmPerChannelQuantParams, + neuron_model_setOperandSymmPerChannelQuantParams) + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + bool is_depthwise_mode = ic == groups && oc == groups; + VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the input size." 
+ << paddings.size(); + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + float input_scale; + float output_scale; + std::vector weight_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + weight_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + output_scale = op_info->GetAttr("output_scale"); + VLOG(3) << "has output scale:" << output_scale; + } else { + return FAILED; + } + } else { + return FAILED; + } + + VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups + << " ,dilations: " << dilations[0] << ":" << dilations[1]; + VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; + VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims + << " ,weight_scale size: " << weight_scale.size(); + VLOG(3) << "filter_dims: " << filter_dims + << " ,memory_size: " << filter->memory_size() + << " ,data_size: " << filter->data_size(); + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + inType.dimensions = &dims_in[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Graph has " << input_name; + // input operand already exist + input_node = graph->Get(input_name); + } else { + // add input operand + if (graph->IsInput(input_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node( + ctx, + input_name, + "transpose_" + input_name, + {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + + // change input_name + input_name = "transpose_" + input_name; + input_node = graph->Get(input_name); + if (input_node == nullptr) return subgraph::FAILED; + } else { + (*neuron_model_addOperand)(model, &inType); // input + input_node = graph->Add(input_name, dims_in); + } + } + VLOG(3) << "input node idx" << input_node->index() + << ": input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] + << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; + + // Add bias type + NeuronOperandType biasType; + + // Add filter type + // filter NCHW -> NHWC + Tensor transpose_filter; + std::vector dims_filter; + + if (is_depthwise_mode) { + transpose_filter.Resize({1, + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}); + dims_filter = {1, + (uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}; + } else { + transpose_filter.Resize({(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}); + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[1], + 
(uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}; + } + + NeuronOperandType filterType; + NeuronOperandType channelFilterType; + NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; + if (1 == weight_scale.size()) { + // Per layer type + filterType.type = NEURON_TENSOR_QUANT8_ASYMM; + filterType.scale = weight_scale[0]; + filterType.zeroPoint = 128; + filterType.dimensionCount = filter_dims.size(); + filterType.dimensions = &dims_filter[0]; + biasType.scale = inType.scale * filterType.scale; + } else { + // Per channel type + channelFilterType.type = NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL; + channelFilterType.scale = 0.0f; + channelFilterType.zeroPoint = 0; + channelFilterType.dimensionCount = filter_dims.size(); + channelFilterType.dimensions = &dims_filter[0]; + + // Per channel setting + if (is_depthwise_mode) + symmPerChannelQuantParams.channelDim = 3; + else + symmPerChannelQuantParams.channelDim = 0; + symmPerChannelQuantParams.scaleCount = weight_scale.size(); + symmPerChannelQuantParams.scales = weight_scale.data(); + biasType.scale = 0; + } + + std::shared_ptr filter_node = nullptr; + if (1 == weight_scale.size()) { + (*neuron_model_addOperand)(model, &filterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" + << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + << ":" << filterType.dimensions[1] << ":" + << filterType.dimensions[2] << ":" << filterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + } else { + (*neuron_model_addOperand)(model, &channelFilterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "chennel filter node idx: " << filter_node->index() + << " ,scale_count:" << weight_scale.size() + << " weight_scale[0]:" << weight_scale.data()[0] + << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" + << channelFilterType.dimensions[1] << ":" + << channelFilterType.dimensions[2] << ":" + << channelFilterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + neuron_errCode = (*neuron_model_setOperandSymmPerChannelQuantParams)( + model, filter_node->index(), &symmPerChannelQuantParams); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set per channel filter params fail:" << neuron_errCode; + return subgraph::FAILED; + } + } + + // Add biasType node value + // A 1-D tensor, of shape [depth_out], specifying the bias. + // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias + // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 + // and bias_scale of 0. 
The actual scale of each value 'i' is equal + // to bias_scale[i] = input_scale * filter_scale[i]. + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + std::vector dims_bias; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + for (int i = 0; i < bias_dims.size(); i++) + dims_bias.push_back(bias_dims[i]); + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << bias_dims; + } else { + biasType.dimensionCount = 1; + dims_bias = {(uint32_t)output_dims[1]}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(filter_name + "_default_bias", dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << dims_bias.size(); + } + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {1}; + + std::shared_ptr paddingL_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 3: padding left + paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 4: padding right + paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 5: padding top + paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 6: padding bottom + paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 7: stride width + strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 8: stride height + strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); + + std::shared_ptr dm_node = nullptr; + if (is_depthwise_mode) { + (*neuron_model_addOperand)(model, &int32Type); // 9: depthwise multiplier + dm_node = graph->Add(filter_name + "_dm", dims_int32); + } + + std::shared_ptr fuse_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 9/10: fuse + fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + if (graph->IsOutput(output_name)) + outType.scale = output_scale / 127; + else + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = output_dims.size(); + std::vector dims_out = {(uint32_t)output_dims[0], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3], + (uint32_t)output_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr output_node = nullptr; + if (graph->Has(output_name)) { + output_node = graph->Get(output_name); + } else { + // add output operand + if 
(graph->IsOutput(output_name)) { + (*neuron_model_addOperand)(model, &outType); // output + output_node = graph->Add("transpose_" + output_name, dims_out); + } else { + (*neuron_model_addOperand)(model, &outType); // output + output_node = graph->Add(output_name, dims_out); + } + } + VLOG(3) << "output node idx: " << output_node->index() + << ": output_scale: " << outType.scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add bias value + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32( + bias->data(), input_scale, weight_scale, int32_bias_data); + + VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " + << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " + << int32_bias_data[3]; + neuron_errCode = (*neuron_model_setOperandValue)( + model, bias_node->index(), bias->raw_data(), bias->memory_size()); + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, output_dims[1]}); + int32_bias->mutable_data(); + VLOG(3) << "bias_default: " << int32_bias->memory_size(); + memset(int32_bias->mutable_data(), 0, int32_bias->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); + bias_node->set_data(int32_bias); + } + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + + VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" + << paddings[2] << ":" << paddings[3]; + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + (*neuron_model_setOperandValue)( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + (*neuron_model_setOperandValue)( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + (*neuron_model_setOperandValue)( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + (*neuron_model_setOperandValue)( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + (*neuron_model_setOperandValue)( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + (*neuron_model_setOperandValue)( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + if (act_type == "relu") { + fuse_val[0] = 1; + } else if (act_type == "relu1") { + fuse_val[0] = 2; + } else if (act_type == "relu6") { + fuse_val[0] = 3; + } else if (!act_type.empty()) { + fuse_val[0] = 0; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + + if (is_depthwise_mode) { + int32_t dm = oc / ic; + (*neuron_model_setOperandValue)( + model, dm_node->index(), &dm, sizeof(int32_t) * 1); + VLOG(3) << "depthwise multiplier:" << dm; + + // Depthwise conv + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias +
paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + dm_node->index(), // 9: depthwise multiplier + fuse_node->index()}; // 10 : fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_DEPTHWISE_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + fuse_node->index()}; // 9: fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + if (graph->IsOutput(output_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node( + ctx, + "transpose_" + output_name, + output_name, + dims_out, + {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + output_node = graph->Get(output_name); + if (output_node == nullptr) return subgraph::FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); diff --git a/lite/kernels/apu/bridges/elementwise_ops.cc b/lite/kernels/apu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c637e0fe746ce2a4d2b42dc902d62279967e73c --- /dev/null +++ b/lite/kernels/apu/bridges/elementwise_ops.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + + auto y_name = op_info->Input("Y").front(); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto axis = op_info->GetAttr("axis"); + + // Act node + if (op_type == "fusion_elementwise_add_activation" || + op_type == "fusion_elementwise_sub_activation" || + op_type == "fusion_elementwise_mul_activation" || + op_type == "fusion_elementwise_div_activation") { + auto act_type = op_info->GetAttr("act_type"); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d4ffc762e287618c8eb6b31908909cca4af91d1 --- /dev/null +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_GE(input_dims.size(), 2UL); + auto w_name = op_info->Input("W").front(); + auto w = scope->FindMutableTensor(w_name); + auto w_dims = w->dims(); + CHECK_EQ(w_dims.size(), 2UL); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "[APU] input dims: " << input_dims << " w dims: " << w_dims + << " out_dims: " << out_dims << " m: " << m << " k: " << k + << " n: " << n; + + float input_scale = 1.0f; + float out_scale = 1.0f; + std::vector w_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + w_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + return FAILED; + } + } else { + return FAILED; + } + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + + inType.dimensions = &dims_in[0]; + std::shared_ptr in_node = nullptr; + if (graph->Has(input_name)) { + // input operand already exist + in_node = graph->Get(input_name); + VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); + } else { + // add input operand + (*neuron_model_addOperand)(model, &inType); // 0: input + in_node = graph->Add(input_name, dims_in); + } + VLOG(3) << "input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << " : " + << inType.dimensions[1] << " : " << inType.dimensions[2] << " : " + << inType.dimensions[3]; + + NeuronOperandType wType; + wType.type = NEURON_TENSOR_QUANT8_ASYMM; + wType.scale = w_scale[0]; + wType.zeroPoint = 128; + wType.dimensionCount = w_dims.size(); + std::vector dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; + wType.dimensions = &dims_w[0]; + (*neuron_model_addOperand)(model, &wType); // 1: weight + std::shared_ptr w_node = nullptr; + w_node = graph->Add(w_name, dims_w); + VLOG(3) << "w_scale size: " << w_scale.size() 
<< ",w_scale: " << w_scale[0] + << ", wType dimensions: " << wType.dimensions[0] << " : " + << wType.dimensions[1] << ", memory size: " << w->memory_size(); + + // Add bias type + NeuronOperandType biasType; + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + biasType.scale = input_scale * w_scale[0]; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + std::vector dims_bias = {(uint32_t)bias_dims[0]}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims + << ", bias scale: " << biasType.scale + << " ,memory size: " << bias->memory_size(); + } else { + biasType.dimensionCount = 1; + std::vector dims_bias = {(uint32_t)n}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(w_name + "_default_bias", dims_bias); + } + + // Add fuse type + NeuronOperandType fuseType; + fuseType.type = NEURON_INT32; + fuseType.dimensionCount = 0; + std::vector dims_int32 = {0}; + (*neuron_model_addOperand)(model, &fuseType); // 3: fuse + std::shared_ptr fuse_node = nullptr; + fuse_node = graph->Add(w_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = 2; + std::vector dims_out = {(uint32_t)out_dims[0], out_dims[1]}; + outType.dimensions = &dims_out[0]; + VLOG(3) << "out_scale: " << out_scale + << ", outType: " << outType.dimensions[0] << " : " + << outType.dimensions[1]; + (*neuron_model_addOperand)(model, &outType); // output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_out); + + int8_t* w_data = w->mutable_data(); + Tensor transpose_filter; + // Original dimension + transpose_filter.Resize({(uint32_t)w_dims[1], (uint32_t)w_dims[0]}); + transpose_filter.mutable_data(); + transposeAsym(w->data(), + transpose_filter.mutable_data(), + {1, 1, (uint32_t)w_dims[0], (uint32_t)w_dims[1]}, + {0, 1, 3, 2}); + memcpy(w->mutable_data(), + transpose_filter.mutable_data(), + w->memory_size()); + int neuron_errCode = (*neuron_model_setOperandValue)( + model, w_node->index(), w->raw_data(), w->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set W operand value fail:" << neuron_errCode + << ",index: " << w_node->index(); + return FAILED; + } + + // Add bias if bias tensor exists + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32(bias->data(), input_scale, w_scale, int32_bias_data); + + VLOG(3) << int32_bias_data[0] << ":" << int32_bias_data[1] << ":" + << int32_bias_data[2] << ":" << int32_bias_data[3]; + neuron_errCode = + (*neuron_model_setOperandValue)(model, + bias_node->index(), + bias->raw_data(), + bias->memory_size()); // 2: bias + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, out_dims[1]}); + int32_bias->mutable_data(); + memset(int32_bias->mutable_data(), 0, 
+           int32_bias->memory_size());
+    VLOG(3) << "default: " << int32_bias->memory_size();
+    neuron_errCode =
+        (*neuron_model_setOperandValue)(model,
+                                        bias_node->index(),
+                                        int32_bias->raw_data(),
+                                        int32_bias->memory_size());  // 2: bias
+    bias_node->set_data(int32_bias);
+  }
+  // Add fuse value
+  int32_t fuse_val[1] = {0};
+  (*neuron_model_setOperandValue)(
+      model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);  // 3: fuse
+
+  std::vector<uint32_t> addInIndex = {in_node->index(),
+                                      w_node->index(),
+                                      bias_node->index(),
+                                      fuse_node->index()};
+  std::vector<uint32_t> addOutIndex = {out_node->index()};
+  neuron_errCode = (*neuron_model_addOperation)(model,
+                                                NEURON_FULLY_CONNECTED,
+                                                addInIndex.size(),
+                                                &addInIndex[0],
+                                                addOutIndex.size(),
+                                                &addOutIndex[0]);
+
+  if (NEURON_NO_ERROR != neuron_errCode) {
+    LOG(WARNING) << "Add op fail:" << op_type;
+    return FAILED;
+  }
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(fc, kAPU, paddle::lite::subgraph::apu::FCConverter);
diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..515853aa26a1d84339c61047b5d3be20478b5ca3
--- /dev/null
+++ b/lite/kernels/apu/bridges/graph.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/apu/bridges/graph.h"
+#include <utility>
+#include "lite/kernels/apu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace apu {
+
+int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
+  auto it = nodes_.find(name);
+
+  if (it != nodes_.end()) {
+    LOG(FATAL) << "[APU] Node " << name << " is redefined.";
+    return -1;
+  } else {
+    VLOG(3) << " Add: " << name << " : " << node->index();
+    auto ret = nodes_.insert(
+        std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
+    CHECK(ret.second);
+    it = ret.first;
+  }
+  operandIdx_ += 1;
+  it->second.push_back(node);
+
+  return it->second.size();
+}
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/apu/bridges/graph.h b/lite/kernels/apu/bridges/graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..857800abddbebb411fa607ecbf6a8b2dff702b2b
--- /dev/null
+++ b/lite/kernels/apu/bridges/graph.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "NeuronAdapter.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace apu {
+
+// Graph and Node are defined to collect all of the converted Neuron IR nodes
+class Node {
+ public:
+  Node(int32_t operand_idx, std::vector<uint32_t> shape)
+      : idx_(operand_idx), shape_(shape) {}
+
+  void set_shape(std::vector<uint32_t> shape) { shape_ = shape; }
+
+  uint32_t index() { return idx_; }
+  std::vector<uint32_t> shape() const { return shape_; }
+  void set_data(std::shared_ptr<Tensor> data) { data_ = data; }
+
+ private:
+  int32_t idx_;
+  std::vector<uint32_t> shape_;
+  std::shared_ptr<Tensor> data_{nullptr};
+};
+
+class Graph {
+ public:
+  int Add(const std::string& name, std::shared_ptr<Node> node);
+
+  // Variable, const or data node
+  std::shared_ptr<Node> Add(const std::string& name,
+                            std::vector<uint32_t> shape) {
+    CHECK(shape.size()) << name << " : " << shape.size();
+    auto node = std::make_shared<Node>(operandIdx_, shape);
+    auto idx = Add(name, node);
+    CHECK_GE(idx, 1);
+
+    return node;
+  }
+
+  void set_model(NeuronModel* model) { model_ = model; }
+  NeuronModel* model() { return model_; }
+
+  void set_libHandle(void* libHandle) { libHandle_ = libHandle; }
+  void* libHandle() { return libHandle_; }
+
+  void set_input_names(const std::vector<std::string> input_names) {
+    input_names_ = input_names;
+  }
+
+  bool IsInput(const std::string& name) {
+    for (size_t i = 0; i < input_names_.size(); i++) {
+      if (input_names_[i] == name) return true;
+    }
+    return false;
+  }
+
+  bool IsOutput(const std::string& name) {
+    for (size_t i = 0; i < output_names_.size(); i++) {
+      if (output_names_[i] == name) return true;
+    }
+    return false;
+  }
+
+  void set_output_names(const std::vector<std::string> output_names) {
+    output_names_ = output_names;
+  }
+
+  std::shared_ptr<Node> Get(std::string name) {
+    CHECK(Has(name)) << "[APU] Node " << name << " not found.";
+    return nodes_.at(name).back();
+  }
+
+  bool Has(const std::string& name) {
+    return nodes_.find(name) != nodes_.end();
+  }
+
+ private:
+  void* libHandle_;
+  NeuronModel* model_;
+  std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
+  int32_t operandIdx_ = 0;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
+};
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3e68afc6c7c18d2b8d68361ac09de2abf2b684c
--- /dev/null
+++ b/lite/kernels/apu/bridges/paddle_use_bridges.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kAPU); +USE_SUBGRAPH_BRIDGE(conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); +USE_SUBGRAPH_BRIDGE(fc, kAPU); +USE_SUBGRAPH_BRIDGE(pool2d, kAPU); +USE_SUBGRAPH_BRIDGE(softmax, kAPU); diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d17ba7a433f5367328f3826d815c65bd75a6f9a --- /dev/null +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "] "; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // pool mode + if ((pooling_type == "max") || (pooling_type == "avg")) { + } else { + LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type; + return FAILED; + } + + // pad mode + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + + // paddings and strides + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the inputs size."; + + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + auto strides = 
op_info->GetAttr>("strides"); + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + // Add x tensor type + float x_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + x_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = x_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + LOG(INFO) << "Graph has " << x_name; + // input operand already exist + x_node = graph->Get(x_name); + } else { + // add input operand + (*neuron_model_addOperand)(model, &xType); // 0: x + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":" + << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3]; + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {0}; + + std::shared_ptr paddingL_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 1: padding left + paddingL_node = graph->Add(x_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 2: padding right + paddingR_node = graph->Add(x_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 3: padding top + paddingT_node = graph->Add(x_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 4: padding bottom + paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 5: stride width + strideW_node = graph->Add(x_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 6: stride height + strideH_node = graph->Add(x_name + "_stride_height", dims_int32); + + std::shared_ptr filterW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 7: filter width + filterW_node = graph->Add(x_name + "_filter_width", dims_int32); + + std::shared_ptr filterH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 8: filter height + filterH_node = graph->Add(x_name + "_filter_height", dims_int32); + + std::shared_ptr fuse_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 9: fuse + fuse_node = graph->Add(x_name + "_fuse", dims_int32); + + // Add out type + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr out_node 
= nullptr; + if (graph->Has(out_name)) { + out_node = graph->Get(out_name); + } else { + (*neuron_model_addOperand)(model, &outType); // out + out_node = graph->Add(out_name, dims_out); + } + VLOG(3) << "output_scale: " << x_scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + (*neuron_model_setOperandValue)( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + (*neuron_model_setOperandValue)( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + (*neuron_model_setOperandValue)( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + (*neuron_model_setOperandValue)( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + (*neuron_model_setOperandValue)( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + (*neuron_model_setOperandValue)( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add filter + int32_t filter_val[1]; + filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width + (*neuron_model_setOperandValue)( + model, filterW_node->index(), filter_val, sizeof(int32_t) * 1); + filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height + (*neuron_model_setOperandValue)( + model, filterH_node->index(), filter_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + std::vector addInIndex = {x_node->index(), + paddingL_node->index(), + paddingR_node->index(), + paddingT_node->index(), + paddingB_node->index(), + strideW_node->index(), + strideH_node->index(), + filterW_node->index(), + filterH_node->index(), + fuse_node->index()}; + std::vector addOutIndex = {out_node->index()}; + + int neuron_errCode; + if (pooling_type == "max") { + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_MAX_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_AVERAGE_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kAPU, + paddle::lite::subgraph::apu::PoolConverter); diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..59fa8fdfe32c85bfaea5825c82b4752632fd8bed --- /dev/null +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + CHECK_GE(x_dims.size(), 2UL); + auto x_rank = x_dims.size(); + auto out_name = op_info->Output("Out").front(); + + // Check output shape + auto axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + + float input_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + // Check output scale + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = input_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x; + for (int i = 0; i < x_dims.size(); i++) dims_x.push_back(x_dims[i]); + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + // input operand already exist + x_node = graph->Get(x_name); + VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index(); + } else { + // add input operand + (*neuron_model_addOperand)(model, &xType); // 0: input + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "input_scale size: " << input_scale + << " ,x_dims size: " << x_dims.size() << " ,x_dims: " << x_dims; + + // Add beta operand + std::vector dims_int32 = {0}; + NeuronOperandType betaType; + betaType.type = NEURON_FLOAT32; + betaType.dimensionCount = 0; + (*neuron_model_addOperand)(model, &betaType); // 1: beta + std::shared_ptr beta_node = nullptr; + beta_node = graph->Add(x_name + "_beta", dims_int32); + + // Add axis operand + NeuronOperandType axisType; + axisType.type = NEURON_INT32; + axisType.dimensionCount = 0; + (*neuron_model_addOperand)(model, &axisType); // 2: axis + std::shared_ptr axis_node = nullptr; + axis_node = graph->Add(x_name + "_axis", dims_int32); + + // Add out operand + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale / 127; + outType.zeroPoint = 128; + outType.dimensionCount = x_dims.size(); + outType.dimensions = &dims_x[0]; + (*neuron_model_addOperand)(model, &outType); // 3: output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_x); + VLOG(3) << "output_scale: " << out_scale; + + 
float beta_val[] = {1.0f}; + (*neuron_model_setOperandValue)( + model, beta_node->index(), beta_val, sizeof(float) * 1); + + int32_t axis_val[1]; + axis_val[0] = axis; + (*neuron_model_setOperandValue)( + model, axis_node->index(), axis_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + x_node->index(), beta_node->index(), axis_node->index()}; + std::vector addOutIndex = {out_node->index()}; + int neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_SOFTMAX, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kAPU, + paddle::lite::subgraph::apu::SoftmaxConverter); diff --git a/lite/kernels/apu/bridges/utility.cc b/lite/kernels/apu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..eab4d008e57b152e25a131a553fc7cee4f1d7e39 --- /dev/null +++ b/lite/kernels/apu/bridges/utility.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/utility.h" +#include +#include "lite/kernels/apu/bridges/graph.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +// typedef to the build functions pointer signatures +typedef int (*Neuron_getVersion)(uint32_t* version); +typedef int (*NeuronModel_create)(NeuronModel** model); +typedef void (*NeuronModel_free)(NeuronModel* model); +typedef int (*NeuronModel_finish)(NeuronModel* model); +typedef int (*NeuronModel_addOperand)(NeuronModel* model, + const NeuronOperandType* type); +typedef int (*NeuronModel_setOperandValue)(NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); +typedef int (*NeuronModel_addOperation)(NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); +typedef int (*NeuronExecution_create)(NeuronCompilation* compilation, + NeuronExecution** execution); +typedef void (*NeuronExecution_free)(NeuronExecution* execution); +typedef int (*NeuronExecution_setInput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); +typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); +typedef int (*NeuronExecution_compute)(NeuronExecution* execution); + +void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +void insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint) { + int neuron_errCode; + auto graph = static_cast(ctx); + auto model = graph->model(); + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Add input + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = scale; + inType.zeroPoint = zeroPoint; + inType.dimensionCount = input_shape.size(); + inType.dimensions = &input_shape[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Has " << input_name; + input_node = graph->Get(input_name); + } else { + neuron_errCode = (*neuron_model_addOperand)(model, &inType); // input + if 
(NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + VLOG(3) << "Add " << input_name; + input_node = graph->Add(input_name, input_shape); + } + + // Add perm + NeuronOperandType permsType; + permsType.type = NEURON_TENSOR_INT32; + permsType.dimensionCount = 1; + uint32_t dims_perms[1] = {4}; + permsType.dimensions = dims_perms; + + neuron_errCode = (*neuron_model_addOperand)(model, &permsType); // perm + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + std::shared_ptr perms_node = nullptr; + perms_node = graph->Add(input_name + "_perms", {4}); + + VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" + << axis[3]; + // &axis[0], sizeof(int32_t) * axis.size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + + // Add output + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = scale; + outType.zeroPoint = zeroPoint; + outType.dimensionCount = output_shape.size(); + outType.dimensions = &output_shape[0]; + + (*neuron_model_addOperand)(model, &outType); // output + std::shared_ptr output_node = nullptr; + output_node = graph->Add(output_name, output_shape); + + std::vector addInIndex = {input_node->index(), // 0: input + perms_node->index()}; // 1: perm + + std::vector addOutIndex = {output_node->index()}; + + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_TRANSPOSE, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + } +} + +void transpose(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index]; + } + } + } + } +} + +void transposeAsym(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + 
dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index] + 128; // per layer + } + } + } + } +} + +void float2int32(const float* bias_data, + float input_scale, + std::vector weight_scale, + int32_t* int32_bias_data) { + for (int i = 0; i < weight_scale.size(); i++) { + int32_bias_data[i] = bias_data[i] / (input_scale * weight_scale[i]); + } +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..da3f3cd1835a85f3f9d8f4aa3288bd9eebb39ad8 --- /dev/null +++ b/lite/kernels/apu/bridges/utility.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "NeuronAdapter.h" +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +// typedef to the build functions pointer signatures +typedef int (*Neuron_getVersion)(uint32_t* version); +typedef int (*NeuronModel_create)(NeuronModel** model); +typedef void (*NeuronModel_free)(NeuronModel* model); +typedef int (*NeuronModel_finish)(NeuronModel* model); +typedef int (*NeuronModel_addOperand)(NeuronModel* model, + const NeuronOperandType* type); +typedef int (*NeuronModel_setOperandValue)(NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); +typedef int (*NeuronModel_addOperation)(NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); +typedef int (*NeuronExecution_create)(NeuronCompilation* compilation, + NeuronExecution** execution); +typedef void (*NeuronExecution_free)(NeuronExecution* execution); +typedef int (*NeuronExecution_setInput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); +typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); +typedef int (*NeuronExecution_compute)(NeuronExecution* execution); + +void* LoadFunc(void* libHandle, const char* name); + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + +// 
Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +void insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint); + +void transpose(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis); + +void transposeAsym(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis); + +void float2int32(const float* bias_data, + float input_scale, + std::vector weight_scale, + int32_t* int32_bias_data); + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a88b7f8c84fa3daec403373acee69dd84d60498 --- /dev/null +++ b/lite/kernels/apu/subgraph_compute.cc @@ -0,0 +1,297 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/subgraph_compute.h" +#include +#include +#include +#include +#include "lite/backends/apu/device.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +inline void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + +int SubgraphEngine::BuildDeviceProgram() { + typedef int (*Neuron_getVersion)(uint32_t * version); + typedef int (*NeuronModel_create)(NeuronModel * *model); + typedef void (*NeuronModel_free)(NeuronModel * model); + typedef int (*NeuronModel_finish)(NeuronModel * model); + typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel * model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + + // Open the share library + libHandle_ = dlopen("libneuron_adapter.so", RTLD_LAZY); + if (libHandle_ == nullptr) { + LOG(WARNING) << "Failed to open libneuron_adapter.so. 
" << dlerror(); + return subgraph::FAILED; + } + + LOAD_FUNCTIONS(libHandle_, Neuron_getVersion, neuron_getVersion) + LOAD_FUNCTIONS(libHandle_, NeuronModel_create, neuron_model_create) + LOAD_FUNCTIONS(libHandle_, NeuronModel_finish, neuron_model_finish) + LOAD_FUNCTIONS(libHandle_, + NeuronModel_identifyInputsAndOutputs, + neuron_model_identifyInputsAndOutputs) + + unsigned int version; + (*neuron_getVersion)(&version); + VLOG(3) << "Neuron Adapter version: " << version; + + int status = 0; + subgraph::apu::Graph graph; + int neuron_errCode = (*neuron_model_create)(&model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create model"; + return subgraph::FAILED; + } + graph.set_libHandle(libHandle_); + graph.set_model(model_); + graph.set_input_names(input_names_); + graph.set_output_names(output_names_); + + // Convert all of ops and their input vars and weights and added into the APU + // NIR graph + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kAPU))) { + return subgraph::FAILED; + } + + auto kernel = inst.kernel(); + status |= + bridges.Select(op_type, TARGET(kAPU))(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + + // Get input tensor + std::vector ins; + origin_itensors_.resize(input_names_.size()); + origin_idims_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" + << origin_idims_[i].production(); + // Get input index + int idx; + if (graph.Has(input_names_[i])) { + ins.push_back(graph.Get(input_names_[i])->index()); + VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find input: " << input_names_[i]; + return subgraph::FAILED; + } + } + + // Get output tensor + std::vector outs; + origin_otensors_.resize(output_names_.size()); + origin_odims_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" + << origin_odims_[i].production(); + origin_otensors_[i]->mutable_data(); + // Get input index + if (graph.Has(output_names_[i])) { + outs.push_back(graph.Get(output_names_[i])->index()); + VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find output: " << output_names_[i]; + return subgraph::FAILED; + } + } + + VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); + // Set subgraph input/output + (*neuron_model_identifyInputsAndOutputs)( + model_, ins.size(), &ins[0], outs.size(), &outs[0]); + neuron_errCode = (*neuron_model_finish)(model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU NIR model created!"; + + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * 
time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + compilation_ = lite::apu::Device::Global().Build(libHandle_, model_); + if (compilation_ == nullptr) { + LOG(WARNING) << "[APU] Build APU DLA model failed!"; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU DLA model created, Build cost " + << GetCurrentUS() - start_time << " us"; + + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + typedef int (*NeuronExecution_create)(NeuronCompilation * compilation, + NeuronExecution * *execution); + typedef void (*NeuronExecution_free)(NeuronExecution * execution); + typedef int (*NeuronExecution_setInput)(NeuronExecution * execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); + typedef int (*NeuronExecution_setOutput)(NeuronExecution * execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); + typedef int (*NeuronExecution_compute)(NeuronExecution * execution); + + LOAD_FUNCTIONS(libHandle_, NeuronExecution_create, neuron_execution_create) + LOAD_FUNCTIONS(libHandle_, NeuronExecution_free, neuron_execution_free) + LOAD_FUNCTIONS( + libHandle_, NeuronExecution_setInput, neuron_execution_setInput) + LOAD_FUNCTIONS( + libHandle_, NeuronExecution_setOutput, neuron_execution_setOutput) + LOAD_FUNCTIONS(libHandle_, NeuronExecution_compute, neuron_execution_compute) + + NeuronExecution* run1 = NULL; + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + + auto start_time = GetCurrentUS(); + int neuron_errCode = (*neuron_execution_create)(compilation_, &run1); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] Build APU runtime failed!"; + return subgraph::FAILED; + } + + // Set input buffer + Tensor input_temp; + for (size_t i = 0; i < origin_itensors_.size(); i++) { + input_temp.Resize({origin_idims_[i]}); + uint8_t* input_data = input_temp.mutable_data(); + memcpy(input_data, + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); + for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { + input_data[j] += (uint8_t)128; + } + (*neuron_execution_setInput)( + run1, i, NULL, input_data, origin_itensors_[i]->memory_size()); + } + + // Set output buffer + for (size_t i = 0; i < origin_otensors_.size(); i++) { + (*neuron_execution_setOutput)( + run1, + i, + NULL, + reinterpret_cast(origin_otensors_[i]->raw_data()), + origin_otensors_[i]->memory_size()); + } + + neuron_errCode = (*neuron_execution_compute)(run1); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; + return subgraph::FAILED; + } + + for (size_t i = 0; i < origin_otensors_.size(); i++) { + int8_t* output_data = origin_otensors_[i]->mutable_data(); + VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { + output_data[j] -= (int8_t)128; + } + } + (*neuron_execution_free)(run1); + VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; + return 0; +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, + param.sub_block_desc, + param.input_data_names, + param.output_data_names, + param.scope)); + CHECK(engine_); + engine_->Build(); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Launch(); +} + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kAPU, + kInt8, + kNCHW, + paddle::lite::kernels::apu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..cb8743e92914e1fb5752ae930da83ec9761c83a5 --- /dev/null +++ b/lite/kernels/apu/subgraph_compute.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "NeuronAdapter.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + Scope *scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} + + protected: + int BuildDeviceProgram() override; + int LaunchDeviceProgram() override; + + std::string model_name_; + void *libHandle_; + NeuronModel *model_; + NeuronCompilation *compilation_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index a3b1c3680e283a4425fe22209c443ce7cd958267..aa3a52e8ad1223451de06e820da7e1febb43b879 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -63,7 +63,6 @@ add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -92,7 +91,6 @@ add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_ add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -101,7 +99,6 @@ add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${ add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) # 4. training kernels diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index ea60cf528ea71f0bc0ba0a162063bd76899622f9..085e914c6e05c26d3031a4cfdac3c39d31f40f6d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -207,6 +207,16 @@ void ReciprocalCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_abs( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -321,3 +331,8 @@ REGISTER_LITE_KERNEL(reciprocal, .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL( + abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 2e8deda786a1ea9af70499c7b33c8aa1c6e19370..2e9774637b7a9156197ffeff5f4bca13a20620bb 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -166,6 +166,15 @@ class ReciprocalCompute : public KernelLite { virtual ~ReciprocalCompute() = default; }; +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~AbsCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index 709942a0d9f385e4ba55be32657633c0edc378cf..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/arm/compare_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define COMPARE_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T &a, const T &b) const { return a op b; } \ - }; - -COMPARE_FUNCTOR(Equal, ==); -COMPARE_FUNCTOR(NotEqual, !=); -COMPARE_FUNCTOR(LessThan, <); -COMPARE_FUNCTOR(LessEqual, <=); -COMPARE_FUNCTOR(GreaterThan, >); -COMPARE_FUNCTOR(GreaterEqual, >=); - -template <> -struct _EqualFunctor { - inline bool operator()(const float &a, const float &b) const { - // It is safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } -}; - -template <> -struct _NotEqualFunctor { - inline bool operator()(const float &a, const float &b) const { - return !_EqualFunctor()(a, b); - } -}; - -inline void get_mid_dims(const lite::DDim &x_dims, - const lite::DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post) { - *pre = 1; - *n = 1; - *post = 1; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - -template