Unverified · Commit 82e4b53d authored by HappyAngel, committed by GitHub

Merge pull request #97 from PaddlePaddle/develop

pull code
...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
if (MSVC_STATIC_CRT)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
add_compile_options(/MP)
message(STATUS "Using parallel compiling (/MP)")
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
find_package(CUDA QUIET)
endif()
...@@ -59,10 +84,12 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
...@@ -105,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
if(WIN32)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
else()
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
...@@ -129,6 +163,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11
endif()
if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
...@@ -136,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/postproject)
include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK
include(device/apu) # check and prepare APU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed.
...@@ -185,6 +224,7 @@ endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
...@@ -209,7 +249,9 @@ include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION
if(NOT APPLE)
include(flags)
endif()
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
rem global variables
set BUILD_EXTRA=OFF
set BUILD_JAVA=ON
set BUILD_PYTHON=OFF
set BUILD_DIR=%source_path%
set OPTMODEL_DIR=""
set BUILD_TAILOR=OFF
set BUILD_CV=OFF
set SHUTDOWN_LOG=ON
set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
set workspace=%source_path%
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of Visual Studio's vcvarsall.bat, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "%vcvarsall_dir%" (
echo "------------%vcvarsall_dir% not exist------------"
goto set_vcvarsall_dir
)
call:prepare_thirdparty
set root_dir=%workspace%
set build_directory=%BUILD_DIR%\build.lite.x86
set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code
set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
md "%build_directory%"
)
rem for code gen, a source file is generated after a test, but some targets in cmake depend on it.
rem here we fake an empty file to make cmake work.
if NOT EXIST "%GEN_CODE_PATH_PREFIX%" (
md "%GEN_CODE_PATH_PREFIX%"
)
type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc"
if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" (
md "%DEBUG_TOOL_PATH_PREFIX%"
)
copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\"
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^
-DWITH_MKLDNN=OFF ^
-DLITE_WITH_X86=ON ^
-DLITE_WITH_PROFILE=OFF ^
-DWITH_LITE=ON ^
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^
-DLITE_WITH_ARM=OFF ^
-DWITH_GPU=OFF ^
-DLITE_BUILD_EXTRA=ON ^
-DLITE_WITH_PYTHON=ON ^
-DPYTHON_EXECUTABLE="%python_path%"
call "%vcvarsall_dir%" amd64
msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1
goto:eof
:prepare_thirdparty
SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>"
set tmp_var=!python_path!
call:remove_space
set python_path=!tmp_var!
if "!python_path!"=="" (
set python_path=python.exe
) else (
if NOT exist "!python_path!" (
echo "------------!python_path! not exist------------"
goto:eof
)
)
if EXIST "%workspace%\third-party" (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists."
) else (
echo "The directory of third_party exists, the third-party-05b862.tar.gz exists."
call:rm_rebuild_dir "%workspace%\third-party"
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
) else (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
call:download_third_party
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
) else (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists."
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
)
git submodule update --init --recursive
goto:eof
:download_third_party
powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^
'%workspace%third-party-05b862.tar.gz')
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
goto remove_left_space
)
goto:eof
\ No newline at end of file
...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
if(WIN32)
# windows header option for all targets.
add_definitions(-D_XKEYCHECK_H)
if (NOT MSVC)
message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
endif(NOT MSVC)
endif(WIN32)
if(LITE_WITH_CUDA)
add_definitions(-DLITE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
...@@ -70,7 +79,7 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32 OR APPLE)
# OpenMP is not well supported on Windows or macOS for now
set(OPENMP_FLAGS "")
else(WIN32)
...@@ -134,6 +143,14 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_APU)
add_definitions("-DLITE_WITH_APU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_APU)
return()
endif()
if(NOT DEFINED APU_DDK_ROOT)
set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
if(NOT APU_DDK_ROOT)
message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
endif()
endif()
message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")
find_path(APU_DDK_INC NAMES NeuronAdapter.h
PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH)
if(NOT APU_DDK_INC)
message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
include_directories("${APU_DDK_ROOT}/include")
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(APU_SUB_LIB_PATH "lib64")
endif()
find_library(APU_NEURON_FILE NAMES neuron
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
if(NOT APU_NEURON_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
add_library(apu_neuron SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()
if(NOT APU_NEURON_ADAPTER_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "${apu_runtime_libs}")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_RKNPU)
return()
endif()
if(NOT DEFINED RKNPU_DDK_ROOT)
set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
if(NOT RKNPU_DDK_ROOT)
message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(RKNPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(RKNPU_SUB_LIB_PATH "lib")
endif()
find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
if(NOT RKNPU_DDK_FILE)
message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
else()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
endif()
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
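A hypothetical example of how this cache variable would feed the new RKNPU_DEPS argument handled by the lite_* helpers later in this change; the library and source names are invented for illustration:

# Illustrative only: declare an RKNPU dependency through the RKNPU_DEPS argument.
lite_cc_library(rknpu_bridge_demo SRCS rknpu_bridge_demo.cc
RKNPU_DEPS ${rknpu_runtime_libs})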
...@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
...@@ -38,7 +32,17 @@ IF(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE)
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated to the official version later.
SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
ELSE()
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated to the official version later.
......
...@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
#find_python_module(numpy REQUIRED)
#find_python_module(wheel REQUIRED)
#find_python_module(google.protobuf REQUIRED)
#FIND_PACKAGE(NumPy REQUIRED)
#IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
# MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
# "please use pip to upgrade protobuf. pip install -U protobuf")
......
...@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} mklml)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
elseif(NOT APPLE)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif(WIN32)
endif()
......
...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
...@@ -88,6 +88,18 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_APU)
foreach(var ${lite_deps_APU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
...@@ -131,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -142,10 +154,12 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -161,8 +175,10 @@ function(lite_cc_library TARGET)
else()
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endif()
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
# collect targets need to compile for lite
if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET})
...@@ -177,7 +193,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -190,8 +206,10 @@ function(lite_cc_binary TARGET)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
...@@ -199,7 +217,9 @@ function(lite_cc_binary TARGET)
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
if (NOT APPLE)
# strip binary target to reduce size
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
...@@ -226,7 +246,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
...@@ -247,8 +267,10 @@ function(lite_cc_test TARGET)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
...@@ -263,7 +285,9 @@ function(lite_cc_test TARGET)
"${TARGET}"
COMMENT "Strip debug symbols done on final executable file.")
endif()
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
file(APPEND ${offline_test_registry_file} "${TARGET}\n")
# collect targets need to compile for lite
...@@ -277,9 +301,11 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
...@@ -295,12 +321,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -323,6 +349,12 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
...@@ -352,6 +384,15 @@ function(add_kernel TARGET device level)
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "APU")
if (NOT LITE_WITH_APU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -379,6 +420,15 @@ function(add_kernel TARGET device level) ...@@ -379,6 +420,15 @@ function(add_kernel TARGET device level)
endif() endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU") if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU) if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -426,8 +476,10 @@ function(add_kernel TARGET device level) ...@@ -426,8 +476,10 @@ function(add_kernel TARGET device level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -447,7 +499,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -480,8 +532,10 @@ function(add_operator TARGET level)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -489,6 +543,29 @@ function(add_operator TARGET level)
)
endfunction()
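For orientation, hypothetical registrations using the add_kernel and add_operator helpers with the new device argument values; the target and source names are invented for illustration and do not appear in this commit:

# Illustrative only: register a kernel for the RKNPU device and a companion operator.
add_kernel(relu_compute_rknpu RKNPU basic SRCS relu_compute.cc RKNPU_DEPS ${rknpu_runtime_libs})
add_operator(relu_op basic SRCS relu_op.cc)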
#only for windows
function(create_static_lib TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
set(dummy_index 1)
set(dummy_offset 1)
# each intermediate dummy library bundles at most dummy_limit (60) input libraries
set(dummy_limit 60)
list(LENGTH libs libs_len)
foreach(lib ${libs})
list(APPEND dummy_list ${lib})
list(LENGTH dummy_list listlen)
if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len}))
merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list})
set(dummy_list)
list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index})
MATH(EXPR dummy_index "${dummy_index}+1")
endif()
MATH(EXPR dummy_offset "${dummy_offset}+1")
endforeach()
merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list})
endfunction()
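In effect, create_static_lib above merges the input archives in batches of at most dummy_limit (60) into intermediate libraries and then merges those intermediates into the final archive, presumably to keep each merge_static_libs invocation manageable on Windows. A hypothetical call with invented target names:

# Illustrative only: bundle three existing static library targets into one archive on Windows.
create_static_lib(demo_bundled_lib lib_a lib_b lib_c)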
# Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target)
...@@ -532,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
set(bundled_tgt_full_name
${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
if(WIN32)
set(dummy_tgt_name dummy_${bundled_tgt_name})
create_static_lib(${bundled_tgt_name} ${static_libs})
add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
add_dependencies(${fake_target} ${tgt_name})
add_library(${dummy_tgt_name} STATIC IMPORTED)
set_target_properties(${dummy_tgt_name}
PROPERTIES
IMPORTED_LOCATION ${bundled_tgt_full_name}
INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
add_dependencies(${dummy_tgt_name} ${fake_target})
return()
endif()
if(NOT IOS)
file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
......
...@@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
...@@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU) endif(LITE_WITH_XPU)
if (LITE_WITH_APU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu")
endif(LITE_WITH_APU)
if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
...@@ -83,14 +91,57 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib
if (LITE_WITH_PYTHON)
if(WIN32)
set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd")
set(LITE_CORE_DEPS ${LITE_CORE})
add_custom_command(OUTPUT ${LITE_CORE}
COMMAND cmake -E copy $<TARGET_FILE:lite_pybind> ${LITE_CORE}
DEPENDS lite_pybind)
add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS})
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd"
DEPENDS copy_lite_pybind
)
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
else()
if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
else()
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
endif()
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
...@@ -108,30 +159,78 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif(WIN32)
endif()
if (LITE_WITH_CUDA OR LITE_WITH_X86)
if(APPLE)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
elseif(NOT WIN32)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif()
if (LITE_WITH_X86)
if(WIN32)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
else()
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -146,6 +245,7 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
endif()
if(LITE_WITH_CUDA)
......
...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml)
endif()
if(WIN32)
target_link_libraries(paddle_full_api_shared shlwapi.lib)
endif()
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
...@@ -34,15 +37,20 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(NOT APPLE AND NOT WIN32)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
endif()
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
...@@ -57,6 +65,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif()
endif()
...@@ -67,8 +80,11 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -80,6 +96,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}") message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}")
...@@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}") ...@@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get APU kernels ${apu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}")
...@@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
...@@ -125,7 +151,9 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
...@@ -144,7 +172,9 @@ if(WITH_TESTING)
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
...@@ -200,7 +230,7 @@ if(WITH_TESTING)
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
DEPS ${lite_model_test_DEPS}
...@@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ...@@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: ocr_test_ut is commented out because we do not supply an OCR model for testing; it serves as a reference for running NLP model inference # brief: ocr_test_ut is commented out because we do not supply an OCR model for testing; it serves as a reference for running NLP model inference
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS}) # DEPS ${lite_model_test_DEPS})
...@@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}) BM_DEPS ${bm_kernels})
...@@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc ...@@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc ...@@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
...@@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ...@@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
...@@ -352,8 +388,10 @@ if(NOT IOS) ...@@ -352,8 +388,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -365,8 +403,10 @@ if(NOT IOS) ...@@ -365,8 +403,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -378,8 +418,10 @@ if(NOT IOS) ...@@ -378,8 +418,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -390,7 +432,9 @@ if(NOT IOS) ...@@ -390,7 +432,9 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -401,19 +445,24 @@ if(NOT IOS) ...@@ -401,19 +445,24 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
......
...@@ -13,7 +13,13 @@ ...@@ -13,7 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include <time.h> #include <time.h>
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
......
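The platform guard above exists because MSVC has no <sys/time.h>, so the benchmark pulls in <windows.h> plus the x86 port header for gettimeofday. A std::chrono-based timer sidesteps that split entirely; the sketch below is illustrative only (GetCurrentUS is a name assumed here, not introduced by this patch).

#include <chrono>

// Portable microsecond timer; no sys/time.h / windows.h branch required.
inline double GetCurrentUS() {
  auto now = std::chrono::steady_clock::now().time_since_epoch();
  return static_cast<double>(
      std::chrono::duration_cast<std::chrono::microseconds>(now).count());
}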
...@@ -20,12 +20,15 @@ ...@@ -20,12 +20,15 @@
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/version.h" #include "lite/core/version.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h> #include <omp.h>
#include "lite/backends/x86/mklml.h" #include "lite/backends/x86/mklml.h"
#endif #endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -67,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -67,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
raw_predictor_.Build(config, places, passes); raw_predictor_.Build(config, places, passes);
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads(); int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1; int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
......
...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { ...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) {
if (element == input_names_.end()) { if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:"; << "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) { for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]"; LOG(ERROR) << "[" << input_names_[i] << "]";
} }
return nullptr; return nullptr;
...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds; std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs; std::vector<cpp::OpDesc*> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) { for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() {
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
output_names_.resize(fetchs.size()); output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) { for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] = input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front(); feeds[i]->Output("Out").front();
} }
for (int i = 0; i < fetchs.size(); i++) { for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] = output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front(); fetchs[i]->Input("X").front();
} }
......
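The int to size_t loop changes above (and the similar ones further down) silence -Wsign-compare, which becomes a hard error on toolchains that build with -Werror. A minimal, self-contained reproduction of the warning and its fix (illustrative, not taken from the patch):

#include <cstddef>
#include <vector>

int Sum(const std::vector<int>& v) {
  int sum = 0;
  // for (int i = 0; i < v.size(); i++)  // warns: signed/unsigned comparison
  for (size_t i = 0; i < v.size(); i++) {  // index type matches size_type
    sum += v[i];
  }
  return sum;
}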
...@@ -37,11 +37,11 @@ TEST(LightAPI, load) { ...@@ -37,11 +37,11 @@ TEST(LightAPI, load) {
const std::vector<std::string> inputs = predictor.GetInputNames(); const std::vector<std::string> inputs = predictor.GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
const std::vector<std::string> outputs = predictor.GetOutputNames(); const std::vector<std::string> outputs = predictor.GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
......
...@@ -293,13 +293,13 @@ int main(int argc, char** argv) { ...@@ -293,13 +293,13 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
std::vector<std::string> str_input_shapes_0 = std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0); split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0; std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) { for (size_t i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
} }
......
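For context, split_string and get_shape in this benchmark turn a flag value such as 1,3,224,224 into a shape vector before it reaches the predictor. A self-contained sketch of that comma-separated parsing (ParseShape is an assumed name, not the benchmark's exact helper):

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Parse "1,3,224,224" into {1, 3, 224, 224}.
std::vector<int64_t> ParseShape(const std::string& repr) {
  std::vector<int64_t> shape;
  std::stringstream ss(repr);
  std::string item;
  while (std::getline(ss, item, ',')) {
    shape.push_back(std::stoll(item));
  }
  return shape;
}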
...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
#ifdef LITE_WITH_X86
config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kInt64)},
Place{TARGET(kHost), PRECISION(kFloat)}});
#else
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
#endif
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -198,7 +204,7 @@ int main(int argc, char** argv) { ...@@ -198,7 +204,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
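The new LITE_WITH_X86 branch above selects kernels through the public CxxConfig interface; isolated from the tool, the call sequence looks like the sketch below, which uses only the places named in the patch (the include path is an assumption based on the source layout).

#include <string>
#include "lite/api/paddle_api.h"  // CxxConfig / CreatePaddlePredictor (path assumed)

void BuildX86Predictor(const std::string& model_dir) {
  using paddle::lite_api::CxxConfig;
  using paddle::lite_api::Place;
  CxxConfig config;
  config.set_model_dir(model_dir);
  config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
                           Place{TARGET(kX86), PRECISION(kInt64)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;  // feed inputs, Run(), and read outputs from here
}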
...@@ -310,7 +310,7 @@ int main(int argc, char** argv) { ...@@ -310,7 +310,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -114,7 +114,7 @@ void detect_object(const float* dout, ...@@ -114,7 +114,7 @@ void detect_object(const float* dout,
} }
std::string name = FLAGS_out_txt + "_accu.txt"; std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w"); FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) { for (size_t i = 0; i < objects.size(); ++i) {
Object object = objects.at(i); Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 && if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) { object.width > 0 && object.height > 0) {
...@@ -324,7 +324,7 @@ int main(int argc, char** argv) { ...@@ -324,7 +324,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -104,13 +104,21 @@ std::vector<Place> ParserValidPlaces() { ...@@ -104,13 +104,21 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") { } else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86)); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") { } else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU)); valid_places.emplace_back(TARGET(kMLU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -187,6 +195,8 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -187,6 +195,8 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -251,16 +261,16 @@ void PrintHelpInfo() { ...@@ -251,16 +261,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
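With rknpu and apu added, the if/else chain in ParserValidPlaces keeps growing while the only per-target data is the list of places. One possible tidy-up is a lookup table keyed by the --valid_targets token; this is an illustrative refactor sketch, not something the patch does.

#include <map>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"  // Place, TARGET/PRECISION/DATALAYOUT (path assumed)

// Places enabled by each --valid_targets token (only the new targets shown).
const std::map<std::string, std::vector<paddle::lite_api::Place>>& TargetTable() {
  using paddle::lite_api::Place;
  static const std::map<std::string, std::vector<Place>> table{
      {"rknpu",
       {Place{TARGET(kRKNPU)},
        Place{TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}}},
      {"apu", {Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}}},
  };
  return table;
}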
...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places_.emplace_back(TARGET(kRKNPU));
valid_places_.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places_.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { ...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() {
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n" " `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `run_optimize(false|true)`\n"
" ` ----false&true refer to whether to record ops info for " " ` ----false&true refer to whether to record ops info for "
"tailoring lib, false by default`\n" "tailoring lib, false by default`\n"
...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the length of the first column: maximum length of the op_type // Get the length of the first column: maximum length of the op_type
......
...@@ -36,11 +36,11 @@ TEST(CxxApi, run) { ...@@ -36,11 +36,11 @@ TEST(CxxApi, run) {
auto inputs = predictor->GetInputNames(); auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
auto outputs = predictor->GetOutputNames(); auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
auto input_tensor = predictor->GetInputByName(inputs[0]); auto input_tensor = predictor->GetInputByName(inputs[0]);
......
...@@ -18,20 +18,21 @@ ...@@ -18,20 +18,21 @@
*/ */
#pragma once #pragma once
#define USE_LITE_OP(op_type__) \ // some platform-independent definition
extern int touch_op_##op_type__(); \ #include "lite/utils/macros.h"
int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
touch_op_##op_type__(); #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
__attribute__((unused)) = \ UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
touch_##op_type__##target__##precision__##layout__##alias__();
#define USE_MIR_PASS(name__) \ #define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \ extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ __attribute__((unused)) = \ static bool mir_pass_usage##name__ UNUSED = \
mir_pass_registry##name__##_fake(); mir_pass_registry##name__##_fake();
#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
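These macros are what the per-backend kernel lists collected in the CMake files above ultimately hook into: a deployment names the kernels it needs, and the fake registration variable keeps the touch symbol (and therefore the kernel's registration object) alive at link time. A sketch of one invocation and its rough expansion (conv2d / def are example names, not taken from this patch):

// In some registration translation unit:
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);

// Roughly expands (token pasting, UNUSED from lite/utils/macros.h) to:
//   extern int touch_conv2dkARMkFloatkNCHWdef();
//   int conv2dkARMkFloatkNCHWdef__use_lite_kernel UNUSED =
//       touch_conv2dkARMkFloatkNCHWdef();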
...@@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) { ...@@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) {
"npu", "npu",
"xpu", "xpu",
"bm", "bm",
"mlu"}; "mlu",
"rknpu",
"apu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) { ...@@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kBM",
"kMLU", "kMLU",
"kBM"}; "kRKNPU",
"kAPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -156,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -156,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kXPU), TARGET(kXPU),
TARGET(kBM), TARGET(kBM),
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU),
TARGET(kFPGA)}); TARGET(kFPGA)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
......
...@@ -49,13 +49,15 @@ enum class TargetType : int { ...@@ -49,13 +49,15 @@ enum class TargetType : int {
kCUDA = 3, kCUDA = 3,
kARM = 4, kARM = 4,
kOpenCL = 5, kOpenCL = 5,
kAny = 6, // any target
kFPGA = 7, kFPGA = 7,
kNPU = 8, kNPU = 8,
kXPU = 9, kXPU = 9,
kBM = 10, kBM = 10,
kMLU = 11, kMLU = 11,
kAny = 6, // any target kRKNPU = 12,
NUM = 12, // number of fields. kAPU = 13,
NUM = 14, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
......
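Because kAny keeps value 6 while kRKNPU = 12 and kAPU = 13 are appended before NUM = 14, every table indexed by static_cast<int>(target) — TargetToStr, TargetRepr, the pybind enum — must list entries in exactly this order. A small compile-time guard of that invariant (illustrative; the patch itself relies on the runtime CHECK_LT):

#include "lite/api/paddle_place.h"  // TargetType (path assumed)

using paddle::lite_api::TargetType;
static_assert(static_cast<int>(TargetType::kRKNPU) == 12, "kRKNPU value changed");
static_assert(static_cast<int>(TargetType::kAPU) == 13, "kAPU value changed");
static_assert(static_cast<int>(TargetType::NUM) == 14, "NUM must trail the last target");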
...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass); ...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
...@@ -17,8 +17,12 @@ execute_process( ...@@ -17,8 +17,12 @@ execute_process(
OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
) )
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in if(APPLE)
${CMAKE_CURRENT_BINARY_DIR}/setup.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
add_subdirectory(pybind) add_subdirectory(pybind)
#add_subdirectory(interface) #add_subdirectory(interface)
...@@ -11,3 +11,12 @@ ...@@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import sys
if os.name == 'nt':
current_path = os.path.abspath(os.path.dirname(__file__))
third_lib_path = current_path + os.sep + 'libs'
os.environ['path'] = third_lib_path + ';' + os.environ['path']
sys.path.insert(0, third_lib_path)
...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif() endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH) if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif() endif()
...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { ...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) {
.value("FPGA", TargetType::kFPGA) .value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU) .value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -34,20 +34,27 @@ else: ...@@ -34,20 +34,27 @@ else:
# core lib of paddlelite is stored as lite.so # core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']} PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs # put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = [] PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] if os.name != 'nt':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
else:
PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll']
shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ if os.name != 'nt':
/inference_lite_lib/python/install/lite/lite.so" COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
if os.system(COMMAND) != 0: /inference_lite_lib/python/install/lite/lite.so"
raise Exception("patch third_party libs failed, command: %s" % COMMAND) if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'): if os.path.isfile(LIB_PATH+'/__init__.py'):
...@@ -61,6 +68,14 @@ PACKAGE_DIR = { ...@@ -61,6 +68,14 @@ PACKAGE_DIR = {
'paddlelite': LITE_PATH 'paddlelite': LITE_PATH
} }
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in PACKAGE_DIR.items():
fix_package_dir[k] = v.replace('/', '\\')
PACKAGE_DIR = fix_package_dir
setup( setup(
name='paddlelite', name='paddlelite',
version=PADDLELITE_VERSION, version=PADDLELITE_VERSION,
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module for packing the whl installer for Paddle-Lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get the paddle-lite version; if it's not based on a release tag, use the commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib']
# link lite.so to paddlelite.libs
COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) {
for (size_t i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5); EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5);
} }
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
} }
......
...@@ -15,7 +15,12 @@ ...@@ -15,7 +15,12 @@
#pragma once #pragma once
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#include <time.h> #include <time.h>
#include <cmath> #include <cmath>
......
...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986,
0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048,
6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) {
0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103,
0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places,
for (int i = 0; i < outs->numel(); ++i) { for (int i = 0; i < outs->numel(); ++i) {
LOG(INFO) << o_data[i]; LOG(INFO) << o_data[i];
} }
for (int i = 0; i < lod.size(); ++i) { for (size_t i = 0; i < lod.size(); ++i) {
for (int j = 0; j < lod[i].size(); ++j) { for (size_t j = 0; j < lod[i].size(); ++j) {
LOG(INFO) << lod[i][j]; LOG(INFO) << lod[i][j];
} }
} }
......
...@@ -8,3 +8,5 @@ add_subdirectory(npu) ...@@ -8,3 +8,5 @@ add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(device_apu SRCS device.cc)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/apu/device.h"
#include <dlfcn.h>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace apu {
inline void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
typedef int (*NeuronCompilation_create)(NeuronModel * model,
NeuronCompilation * *compilation);
typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS
int neuron_errCode = 0;
NeuronCompilation* compilation = NULL;
VLOG(3) << "[APU] Compile model";
neuron_errCode = (*neuron_compilation_create)(model, &compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] Failed to create compilation! " << neuron_errCode;
return nullptr;
}
neuron_errCode = (*neuron_compilation_finish)(compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
return nullptr;
}
VLOG(3) << "[APU] Build done";
return compilation;
}
} // namespace apu
} // namespace lite
} // namespace paddle
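Device::Build resolves the Neuron runtime entry points through dlsym at call time, so the caller only supplies the dlopen handle and a finished NeuronModel. A hedged sketch of the calling side (the library name and the minimal error handling are illustrative assumptions):

#include <dlfcn.h>
#include "lite/backends/apu/device.h"  // path assumed

NeuronCompilation* CompileOnApu(NeuronModel* model) {
  // Library name varies across device images; this one is an assumption.
  void* lib = dlopen("libneuron_adapter.so", RTLD_LAZY);
  if (lib == nullptr) return nullptr;
  return paddle::lite::apu::Device::Global().Build(lib, model);
}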
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
namespace paddle {
namespace lite {
namespace apu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
NeuronCompilation* Build(void* libHandle, NeuronModel* model);
};
} // namespace apu
} // namespace lite
} // namespace paddle
...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src, ...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src,
reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
} }
template <>
void mean_grad<float>(const float* out_grad, float* in_grad, int size) {
float grad = out_grad[0] / size;
float32x4_t grad_v = vdupq_n_f32(grad);
int loop = size >> 2;
int remain = size & 3;
#pragma omp parallel for
for (int i = 0; i < loop; ++i) {
// each iteration writes its own 4-float block, so no pointer is shared across threads
vst1q_f32(in_grad + (i << 2), grad_v);
}
float* remain_ptr = in_grad + (loop << 2);
for (int i = 0; i < remain; ++i) {
remain_ptr[i] = grad;
}
}
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
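The NEON specialization above broadcasts out_grad[0] / size into in_grad four floats at a stride; the generic template declared in the header hunk that follows reduces to the scalar loop below, which is handy as a reference when checking the vectorized path (a sketch, not the shipped generic code):

// Mean backward: d(mean)/d(x_i) = 1 / size, so every element gets the same share.
template <typename T>
void mean_grad_ref(const T* out_grad, T* in_grad, int size) {
  T grad = out_grad[0] / static_cast<T>(size);
  for (int i = 0; i < size; ++i) {
    in_grad[i] = grad;
  }
}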
...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, ...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src,
int height_in, int height_in,
int width_in); int width_in);
template <typename T>
void mean_grad(const T* out_grad, T* in_grad, int size);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input,
if (ceil_h > in_dims_h - 1) { if (ceil_h > in_dims_h - 1) {
ceil_h = in_dims_h- 1; ceil_h = in_dims_h- 1;
} }
float wight0_w = center_w - floor_w; CL_DTYPE wight0_w = center_w - floor_w;
float wight0_h = center_h - floor_h; CL_DTYPE wight0_h = center_h - floor_h;
float wight1_w = 1.0 - wight0_w; CL_DTYPE wight1_w = 1.0 - wight0_w;
float wight1_h = 1.0 - wight0_h; CL_DTYPE wight1_h = 1.0 - wight0_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input,
CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
+ (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
} }
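The wight0/wight1 pairs in the kernel are the standard bilinear interpolation weights: the fractional offset of the sample center from its floor pixel and the complement. A scalar C++ restatement of the same blend over one 2x2 neighbourhood (naming follows the kernel; the function itself is illustrative):

#include <cmath>

// d00 = left-down, d10 = right-down, d01 = left-up, d11 = right-up.
float BilinearSample(float d00, float d10, float d01, float d11,
                     float center_w, float center_h) {
  float w0_w = center_w - std::floor(center_w);  // weight toward the right column
  float w0_h = center_h - std::floor(center_h);  // weight toward the upper row
  float w1_w = 1.0f - w0_w;
  float w1_h = 1.0f - w0_h;
  return (d00 * w1_w + d10 * w0_w) * w1_h + (d01 * w1_w + d11 * w0_w) * w0_h;
}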
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
std::unique_ptr<rk::nn::Exection> Device::Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
) {
VLOG(3) << "[RKNPU] Build model";
rk_graph->SetInputsOutputs(input_nodes, output_nodes);
std::unique_ptr<rk::nn::Exection> exector =
std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
exector->Build();
return exector;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
// Build the RK IR graph into a compiled model and return the executor
// used to load that model and run inference.
std::unique_ptr<rk::nn::Exection> Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
); // NOLINT
private:
};
} // namespace rknpu
} // namespace lite
} // namespace paddle
...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) lite_cc_library(x86_cpu_info SRCS cpu_info.cc)
add_subdirectory(jit) add_subdirectory(jit)
add_subdirectory(math) add_subdirectory(math)
...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { ...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() {
void* GetMKLMLDsoHandle() { void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib");
#elif defined(_WIN32) #elif defined(_WIN32)
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else #else
......
...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { ...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() {
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0; size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) { for (size_t i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len; wgt_offset_tmp += groups[i] * block_len;
} }
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
......
...@@ -28,6 +28,12 @@ ...@@ -28,6 +28,12 @@
#define posix_memalign_free free #define posix_memalign_free free
#endif #endif
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) { void* GenBase::operator new(size_t size) {
void* ptr; void* ptr;
constexpr size_t alignment = 32ul; constexpr size_t alignment = 32ul;
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0, 0,
"GenBase Alloc %ld error!", "GenBase Alloc %ld error!",
size); size);
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr; return ptr;
} }
......
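gen_base.cc needs the two allocation paths above because MSVC lacks posix_memalign, and memory from _aligned_malloc must be released with _aligned_free — which is exactly what the posix_memalign_free macro earlier in this hunk maps to. A matched alloc/free pair under the same rules (an illustrative sketch, not code from the patch):

#include <cstddef>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>  // _aligned_malloc / _aligned_free
#endif

void* AlignedAlloc(size_t size, size_t alignment) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);
#else
  void* ptr = nullptr;
  return posix_memalign(&ptr, alignment, size) == 0 ? ptr : nullptr;
#endif
}

void AlignedFree(void* ptr) {
#ifdef _WIN32
  _aligned_free(ptr);  // plain free() must not be used on _aligned_malloc memory
#else
  free(ptr);
#endif
}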
...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> { ...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// size_t num_seqs = scores->NumElements(lod_level); // size_t num_seqs = scores->NumElements(lod_level);
size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t num_seqs = scores->lod()[lod_level].size() - 1;
size_t seq_width = 1; size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) { for (size_t i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i]; seq_width *= scores->dims()[i];
} }
......
...@@ -23,7 +23,7 @@ namespace math { ...@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols, int num_flatten_cols,
bool trans) { bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1); PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
MatDescriptor retv; MatDescriptor retv;
if (num_flatten_cols > 1) { if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { ...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
auto idx_dims = index->dims(); auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, out_dims); PADDLE_ENFORCE_EQ(idx_dims, out_dims);
...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) { lite::Tensor* index) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { ...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor {
auto idx_dims = index.dims(); auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) { for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, og_dims); PADDLE_ENFORCE_EQ(idx_dims, og_dims);
......
...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { ...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() {
  // Do not support nested omp parallelism. // Do not support nested omp parallelism.
num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); num_threads = omp_in_parallel() ? 1 : omp_get_max_threads();
#endif #endif
return std::max(num_threads, 1L); return std::max<int>(num_threads, 1L);
} }
using ThreadHandler = using ThreadHandler =
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <time.h>
#include <cstdio> #include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <time.h>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -37,7 +37,9 @@ ...@@ -37,7 +37,9 @@
#define GOOGLE_GLOG_DLL_DECL #define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
#include <stdio.h> #include <stdio.h>
#define NOMINMAX  // MSVC's max/min macros conflict with std::min/max
#include <windows.h> #include <windows.h>
#include <winsock.h>
#include <numeric> // std::accumulate in msvc #include <numeric> // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h #ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { ...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule); return reinterpret_cast<void *>(hModule);
} }
extern struct timeval;
static int gettimeofday(struct timeval *tp, void *tzp) { static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock; time_t clock;
struct tm tm; struct tm tm;
......
...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
endif() endif()
if (LITE_WITH_X86)
lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc) lite_cc_library(types SRCS types.cc)
else()
lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc)
endif()
lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel)
lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(scope SRCS scope.cc DEPS tensor)
lite_cc_library(device_info SRCS device_info.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { ...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() {
CHECK(!shared_tensor_array->empty()) CHECK(!shared_tensor_array->empty())
<< "shared_tensor_array is empty yet"; << "shared_tensor_array is empty yet";
target_tensor_array->resize(shared_tensor_array->size()); target_tensor_array->resize(shared_tensor_array->size());
for (int i = 0; i < shared_tensor_array->size(); i++) { for (size_t i = 0; i < shared_tensor_array->size(); i++) {
target_tensor_array->at(i).Resize( target_tensor_array->at(i).Resize(
shared_tensor_array->at(i).dims()); shared_tensor_array->at(i).dims());
TargetCopy(param_type->type->target(), TargetCopy(param_type->type->target(),
...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, ...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name,
auto b_tensor_array = auto b_tensor_array =
base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>(); base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>();
CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size());
for (int i = 0; i < a_tensor_array->size(); i++) { for (size_t i = 0; i < a_tensor_array->size(); i++) {
Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* a_tensor = &(a_tensor_array->at(i));
Tensor* b_tensor = &(b_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i));
if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) {
......
...@@ -166,7 +166,7 @@ class TestCase { ...@@ -166,7 +166,7 @@ class TestCase {
// TODO(Superjomn) Move this method to utils or DDim? // TODO(Superjomn) Move this method to utils or DDim?
bool ShapeEquals(const DDim& a, const DDim& b) { bool ShapeEquals(const DDim& a, const DDim& b) {
if (a.size() != b.size()) return false; if (a.size() != b.size()) return false;
for (int i = 0; i < a.size(); i++) { for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return false; if (a[i] != b[i]) return false;
} }
return true; return true;
......
...@@ -54,11 +54,13 @@ using HostContext = Context<TargetType::kHost>; ...@@ -54,11 +54,13 @@ using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>; using NPUContext = Context<TargetType::kNPU>;
using APUContext = Context<TargetType::kAPU>;
using XPUContext = Context<TargetType::kXPU>; using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>; using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>; using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>; using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>; using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
template <> template <>
class Context<TargetType::kHost> { class Context<TargetType::kHost> {
...@@ -86,6 +88,21 @@ class Context<TargetType::kNPU> { ...@@ -86,6 +88,21 @@ class Context<TargetType::kNPU> {
}; };
#endif #endif
#ifdef LITE_WITH_APU
template <>
class Context<TargetType::kAPU> {
public:
Context() {}
explicit Context(const APUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(APUContext* ctx) {}
  APUContext& operator=(const APUContext& ctx) { return *this; }
std::string name() const { return "APUContext"; }
};
#endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
template <> template <>
class Context<TargetType::kBM> { class Context<TargetType::kBM> {
...@@ -103,6 +120,21 @@ class Context<TargetType::kBM> { ...@@ -103,6 +120,21 @@ class Context<TargetType::kBM> {
}; };
#endif #endif
#ifdef LITE_WITH_RKNPU
template <>
class Context<TargetType::kRKNPU> {
public:
Context() {}
explicit Context(const RKNPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(RKNPUContext* ctx) {}
  RKNPUContext& operator=(const RKNPUContext& ctx) { return *this; }
std::string name() const { return "RKNPUContext"; }
};
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
template <> template <>
class Context<TargetType::kXPU> { class Context<TargetType::kXPU> {
...@@ -392,6 +424,18 @@ class ContextScheduler { ...@@ -392,6 +424,18 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
&ctx->As<APUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
&ctx->As<RKNPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
case TARGET(kXPU): case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo( kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
...@@ -461,6 +505,12 @@ class ContextScheduler { ...@@ -461,6 +505,12 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>(); InitContext<TargetType::kXPU, XPUContext>();
#endif #endif
......
...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { ...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
active_ids_ = core_ids_; active_ids_ = core_ids_;
} else { } else {
active_ids_.resize(thread_num); active_ids_.resize(thread_num);
for (int i = 0; i < thread_num; ++i) { for (uint32_t i = 0; i < thread_num; ++i) {
if (i < big_core_ids_.size()) { if (i < big_core_ids_.size()) {
active_ids_[i] = big_core_ids_[i]; active_ids_[i] = big_core_ids_[i];
} else { } else {
......
...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, ...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *alias, std::string *alias,
Place *place) { Place *place) {
auto parts = Split(kernel_type, "/"); auto parts = Split(kernel_type, "/");
CHECK_EQ(parts.size(), 5); CHECK_EQ(parts.size(), 5u);
*op_type = parts[0]; *op_type = parts[0];
*alias = parts[1]; *alias = parts[1];
......
...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + i * w; auto ptr_row = conv_weight_d + i * w;
for (unsigned int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i]; ptr_row[j] *= alpha_data[i];
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels for (int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels for (int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i]; conv_weight_d[i * w + j] *= alpha_data[i];
} }
} }
......
...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto channel_scale_tensor = auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>(); scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>(); auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) { for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range); weight_scale.push_back(channel_scale_data[i] / range);
} }
......
...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)}) .BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); .ExcludeTargets({TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph op's type info // get subgraph op's type info
size_t kernel_size = inst_node->AsStmt().kernels().size(); size_t kernel_size = inst_node->AsStmt().kernels().size();
CHECK_GT(kernel_size, 0); CHECK_GT(kernel_size, 0u);
VLOG(4) << "subgraph kernel size: " << kernel_size; VLOG(4) << "subgraph kernel size: " << kernel_size;
for (size_t i = 0; i < kernel_size; ++i) { for (size_t i = 0; i < kernel_size; ++i) {
...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { ...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc = auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get()) static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock(); ->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx); auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc); CHECK(op_desc);
if (op_desc->Type() == "conv2d") { if (op_desc->Type() == "conv2d") {
......
...@@ -59,6 +59,9 @@ class PassRegistry { ...@@ -59,6 +59,9 @@ class PassRegistry {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// some platform-independent definitions
#include "lite/utils/macros.h"
#define REGISTER_MIR_PASS(name__, class__) \ #define REGISTER_MIR_PASS(name__, class__) \
paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \
new class__); \ new class__); \
...@@ -66,4 +69,4 @@ class PassRegistry { ...@@ -66,4 +69,4 @@ class PassRegistry {
return mir_pass_registry##name__.Touch(); \ return mir_pass_registry##name__.Touch(); \
} \ } \
static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \
__attribute__((unused)) = mir_pass_registry##name__ UNUSED = mir_pass_registry##name__
...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass) paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kAPU), TARGET(kRKNPU)});
...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { ...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() {
"turquoise4", "snow3", "sienna4", "salmon2", "turquoise4", "snow3", "sienna4", "salmon2",
}; };
std::unordered_map<Node *, int> subgraph_indices; std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) { for (size_t i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) { for (size_t j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i; subgraph_indices[subgraphs_[i][j]] = i;
} }
} }
...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, ...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
std::vector<std::vector<Node *>> subgraphs = std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)(); SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)(); SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size();
subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
} }
......
...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc( ...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc(
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<int64_t>& wshape) { const std::vector<int64_t>& wshape) {
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1u);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2u);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + paddle::lite::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { ...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) {
}; };
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
ASSERT_EQ(graph->nodes().size(), 9); ASSERT_EQ(graph->nodes().size(), 9u);
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) {
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
} }
} // namespace lite } // namespace lite
......
...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
supported_lists.insert(#op_type); \
LOG(INFO) << #op_type
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
...@@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void RKNPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
...@@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)}); .BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
.BindTargets({TARGET(kRKNPU)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)}); .BindTargets({TARGET(kMLU)});
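An aside on the registration idiom the new APU and RKNPU passes reuse (a sketch under assumptions, not code from this PR): each backend's paddle_use_bridges.h is expected to be a flat list of USE_SUBGRAPH_BRIDGE(op_type, target) entries, so redefining the macro just before including the header turns that list into whatever code the pass needs — here, inserts into the supported-op set. A standalone illustration with a hypothetical FAKE_BRIDGE_LIST standing in for the header:

#include <iostream>
#include <string>
#include <unordered_set>

// Hypothetical stand-in for a bridges header; the real list lives in its own file.
#define FAKE_BRIDGE_LIST              \
  USE_SUBGRAPH_BRIDGE(relu, kAPU)     \
  USE_SUBGRAPH_BRIDGE(fc, kAPU)       \
  USE_SUBGRAPH_BRIDGE(softmax, kAPU)

int main() {
  std::unordered_set<std::string> supported_lists;
// Each listed entry expands to an insert() call while the macro is defined.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
  FAKE_BRIDGE_LIST
#undef USE_SUBGRAPH_BRIDGE
  for (const auto& op_type : supported_lists) std::cout << op_type << "\n";
  return 0;
}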
...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { ...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class XPUSubgraphPass : public ProgramPass { class XPUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
...@@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass { ...@@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class RKNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass { class MLUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { ...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
std::vector<std::string> shape_strings = Split(text, ":"); std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size()); shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) { for (size_t i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ","); std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) { for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str())); shapes[i].push_back(atoi(shape_num.c_str()));
...@@ -66,7 +66,7 @@ void FillInputTensors( ...@@ -66,7 +66,7 @@ void FillInputTensors(
for (int j = 0; j < input_tensor_size; j++) { \ for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[j] = static_cast<type>(value); \ input_tensor_data[j] = static_cast<type>(value); \
} }
for (int i = 0; i < input_tensor_shape.size(); i++) { for (size_t i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i); auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]); input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_size = ShapeProduction(input_tensor->shape()); auto input_tensor_size = ShapeProduction(input_tensor->shape());
...@@ -95,7 +95,7 @@ void CheckOutputTensors( ...@@ -95,7 +95,7 @@ void CheckOutputTensors(
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \
EXPECT_LT(rel_diff, 0.1); \ EXPECT_LT(rel_diff, 0.1); \
} }
for (int i = 0; i < output_tensor_type.size(); i++) { for (size_t i = 0; i < output_tensor_type.size(); i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i); auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
......
...@@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, ...@@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name,
auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name); auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name);
auto size = input_or_output_names.size(); auto size = input_or_output_names.size();
CHECK(size == input_or_output_scales.size()); CHECK(size == input_or_output_scales.size());
for (int i = 0; i < size; i++) { for (size_t i = 0; i < size; i++) {
if (input_or_output_names[i] == var_name) { if (input_or_output_names[i] == var_name) {
*scale = input_or_output_scales[i]; *scale = input_or_output_scales[i];
return true; return true;
...@@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes.push_back(node); nodes.push_back(node);
} }
  // Record the cast nodes that have already been created.
std::unordered_map<std::string, Node*> cast_nodes;
for (auto& node : nodes) { for (auto& node : nodes) {
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks; auto inlinks = node->inlinks;
for (auto* in : inlinks) { for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in); ComplementInputs(graph.get(), node, in, &cast_nodes);
} }
} }
} }
void PrecisionCastPass::ComplementInputs(SSAGraph* graph, void PrecisionCastPass::ComplementInputs(
Node* inst_node, SSAGraph* graph,
Node* in) { Node* inst_node,
Node* in,
std::unordered_map<std::string, Node*>* cast_nodes) {
// If this input is out of date. // If this input is out of date.
if (inst_node->inlinks.end() == if (inst_node->inlinks.end() ==
std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in))
...@@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, ...@@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph,
in, in,
graph, graph,
inst_node, inst_node,
cast_nodes,
graph->valid_places()); graph->valid_places());
} }
} }
void PrecisionCastPass::AddCastInst(const Type& from, void PrecisionCastPass::AddCastInst(
const Type& to, const Type& from,
Node* in, const Type& to,
SSAGraph* graph, Node* in,
Node* inst_node, SSAGraph* graph,
const std::vector<Place>& valid_places) { Node* inst_node,
std::unordered_map<std::string, Node*>* cast_nodes,
const std::vector<Place>& valid_places) {
CHECK(!valid_places.empty()) << "valid_place should be set"; CHECK(!valid_places.empty()) << "valid_place should be set";
// var -> new_transform_op -> new_var -> inst // var -> new_transform_op -> new_var -> inst
...@@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, ...@@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from,
auto cast_op_output_name = in->AsArg().name + "/precision_trans"; auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + // in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id()); // paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); if (cast_nodes->count(in->AsArg().name)) {
cast_op_output_arg->AsArg().type = // Remove the old link
LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); RemoveDirectedLink(in, inst_node);
auto* cast_inst = graph->NewInstructNode(); // Update the original instruction OpDesc.
// Update its input to the cast_op_output_name
// Add new link, newarg->inst
DirectedLink(cast_nodes->at(in->AsArg().name),
inst_node); // [io_copy kernel]'s output -> [current kernel]
// reset opdesc and update kernel information
UpdateInputs(
inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name);
} else {
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
auto* cast_inst = graph->NewInstructNode();
// create Op and kernels. // create Op and kernels.
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string cast_type = in_persist ? "calib_once" : "calib"; std::string cast_type = in_persist ? "calib_once" : "calib";
cast_op_output_arg->AsArg().is_persist = in_persist; cast_op_output_arg->AsArg().is_persist = in_persist;
auto cast_op = LiteOpRegistry::Global().Create(cast_type); auto cast_op = LiteOpRegistry::Global().Create(cast_type);
CHECK(cast_op) << "create op [" << cast_op << "] failed"; CHECK(cast_op) << "create op [" << cast_op << "] failed";
// Create the new var manually. // Create the new var manually.
inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); inst_node->AsStmt().op()->scope()->Var(cast_op_output_name);
// Create Calib Instruction. // Create Calib Instruction.
cpp::OpDesc op_desc; cpp::OpDesc op_desc;
op_desc.SetType(cast_type); op_desc.SetType(cast_type);
op_desc.SetInput("Input", {in->AsArg().name}); op_desc.SetInput("Input", {in->AsArg().name});
op_desc.SetOutput("Out", {cast_op_output_name}); op_desc.SetOutput("Out", {cast_op_output_name});
float scale; float scale;
if (InferScale(in, inst_node, &scale)) { if (InferScale(in, inst_node, &scale)) {
op_desc.SetAttr("scale", scale); op_desc.SetAttr("scale", scale);
} }
cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
auto kernels = cast_op->CreateKernels(valid_places); auto kernels = cast_op->CreateKernels(valid_places);
std::vector<std::unique_ptr<KernelBase>> selected_kernels; std::vector<std::unique_ptr<KernelBase>> selected_kernels;
bool is_found = false; bool is_found = false;
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TypeCompatible(*in_arg_ty, from) && if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->precision() == to.precision()) { out_arg_ty->precision() == to.precision()) {
is_found = true; is_found = true;
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op);
break; (*cast_nodes)[in->AsArg().name] = cast_op_output_arg;
break;
}
} }
}
CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":"
<< in->AsArg().name << "->" << to << ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
// Remove the old link // Remove the old link
RemoveDirectedLink(in, inst_node); RemoveDirectedLink(in, inst_node);
// Update the original instruction OpDesc. // Update the original instruction OpDesc.
// Update its input to the io_copy_output_name // Update its input to the io_copy_output_name
// Add new link, var -> new_inst, new_inst->newarg, newarg->inst // Add new link, var -> new_inst, new_inst->newarg, newarg->inst
DirectedLink(in, cast_inst); DirectedLink(in, cast_inst);
DirectedLink(cast_inst, cast_op_output_arg); DirectedLink(cast_inst, cast_op_output_arg);
DirectedLink(cast_op_output_arg, inst_node); DirectedLink(cast_op_output_arg, inst_node);
// reset opdesc and update kernel information // reset opdesc and update kernel information
UpdateInputs( UpdateInputs(
inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name);
}
// recreate the op // recreate the op
auto original_selected_kernel = auto original_selected_kernel =
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "lite/core/mir/pass.h" #include "lite/core/mir/pass.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { ...@@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); void ComplementInputs(SSAGraph* graph,
Node* inst_node,
Node* in,
std::unordered_map<std::string, Node*>* cast_nodes);
void AddCastInst(const Type& from, void AddCastInst(const Type& from,
const Type& to, const Type& to,
Node* in, Node* in,
SSAGraph* graph, SSAGraph* graph,
Node* inst_node, Node* inst_node,
std::unordered_map<std::string, Node*>* cast_nodes,
const std::vector<Place>& valid_places); const std::vector<Place>& valid_places);
void SetValidPlaces(const std::vector<Place>& valid_places); void SetValidPlaces(const std::vector<Place>& valid_places);
......
...@@ -41,7 +41,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -41,7 +41,7 @@ bool OpLite::InferShapeWithCache() {
iter++) { iter++) {
// combined dims value into new_hash value. // combined dims value into new_hash value.
auto &element_dims = (*iter)->dims(); auto &element_dims = (*iter)->dims();
for (int i = 0; i < element_dims.size(); i++) { for (size_t i = 0; i < element_dims.size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(element_dims[i])); lite::hash_combine(new_hash, static_cast<int>(element_dims[i]));
} }
...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() {
auto &emement_lods = (*iter)->lod(); auto &emement_lods = (*iter)->lod();
for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end();
lod_iter++) { lod_iter++) {
for (int i = 0; i < lod_iter->size(); i++) { for (size_t i = 0; i < lod_iter->size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i))); lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i)));
} }
...@@ -60,7 +60,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -60,7 +60,7 @@ bool OpLite::InferShapeWithCache() {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = param_.output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
} }
...@@ -69,7 +69,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -69,7 +69,7 @@ bool OpLite::InferShapeWithCache() {
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = param_.output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_shapes[i] = current_outputs->at(i)->dims();
last_output_lods[i] = current_outputs->at(i)->lod(); last_output_lods[i] = current_outputs->at(i)->lod();
} }
......
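For the hashing used in InferShapeWithCache above, lite::hash_combine itself is not shown in this diff; a plausible value-returning variant in the familiar boost style would look like the sketch below (an assumption about its shape, not the actual implementation):

#include <cstddef>
#include <functional>

// Mix one more value into an existing hash; the constants follow the common
// boost::hash_combine recipe and may differ from the real lite::hash_combine.
inline std::size_t hash_combine(std::size_t seed, int value) {
  return seed ^ (std::hash<int>{}(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}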
...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kNPU): { case TARGET(kNPU): {
CREATE_KERNEL(kNPU); CREATE_KERNEL(kNPU);
} break; } break;
case TARGET(kAPU): {
CREATE_KERNEL(kAPU);
} break;
case TARGET(kXPU): { case TARGET(kXPU): {
CREATE_KERNEL(kXPU); CREATE_KERNEL(kXPU);
} break; } break;
...@@ -110,6 +113,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -110,6 +113,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kMLU): { case TARGET(kMLU): {
CREATE_KERNEL(kMLU); CREATE_KERNEL(kMLU);
} break; } break;
case TARGET(kRKNPU): {
CREATE_KERNEL(kRKNPU);
} break;
default: default:
CHECK(false) << "not supported kernel target " << TargetToStr(target); CHECK(false) << "not supported kernel target " << TargetToStr(target);
} }
...@@ -217,6 +223,7 @@ KernelRegistry::KernelRegistry() ...@@ -217,6 +223,7 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kNHWC);
INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kNPU, kAny, kAny);
INIT_FOR(kAPU, kInt8, kNCHW);
INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW);
INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW);
INIT_FOR(kXPU, kAny, kNCHW); INIT_FOR(kXPU, kAny, kNCHW);
...@@ -232,6 +239,11 @@ KernelRegistry::KernelRegistry() ...@@ -232,6 +239,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kInt8, kNCHW);
INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kNCHW);
INIT_FOR(kBM, kAny, kAny); INIT_FOR(kBM, kAny, kAny);
INIT_FOR(kRKNPU, kFloat, kNCHW);
INIT_FOR(kRKNPU, kInt8, kNCHW);
INIT_FOR(kRKNPU, kAny, kNCHW);
INIT_FOR(kRKNPU, kAny, kAny);
#undef INIT_FOR #undef INIT_FOR
} }
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -231,6 +234,9 @@ class KernelRegistry final { ...@@ -231,6 +234,9 @@ class KernelRegistry final {
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kAPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU), KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -251,6 +257,16 @@ class KernelRegistry final { ...@@ -251,6 +257,16 @@ class KernelRegistry final {
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA), KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -435,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> { ...@@ -435,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> {
#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \
LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__)
#define REGISTER_LITE_KERNEL( \ #define REGISTER_LITE_KERNEL( \
op_type__, target__, precision__, layout__, KernelClass, alias__) \ op_type__, target__, precision__, layout__, KernelClass, alias__) \
static paddle::lite::KernelRegistor<TARGET(target__), \ static paddle::lite::KernelRegistor<TARGET(target__), \
PRECISION(precision__), \ PRECISION(precision__), \
DATALAYOUT(layout__), \ DATALAYOUT(layout__), \
KernelClass> \ KernelClass> \
LITE_KERNEL_REGISTER_INSTANCE( \ LITE_KERNEL_REGISTER_INSTANCE( \
op_type__, target__, precision__, layout__, alias__)(#op_type__, \ op_type__, target__, precision__, layout__, alias__)(#op_type__, \
#alias__); \ #alias__); \
static KernelClass LITE_KERNEL_INSTANCE( \ static KernelClass LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__); \ op_type__, target__, precision__, layout__, alias__); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \ int touch_##op_type__##target__##precision__##layout__##alias__() { \
OpKernelInfoCollector::Global().AddKernel2path( \ OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \ __FILE__); \
LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
.Touch(); \ .Touch(); \
return 0; \ return 0; \
} \ } \
static bool LITE_KERNEL_PARAM_INSTANCE( \ static bool LITE_KERNEL_PARAM_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) UNUSED = \
__attribute__((unused)) = \ paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \
paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \ PRECISION(precision__), \
PRECISION(precision__), \ DATALAYOUT(layout__)>( \
DATALAYOUT(layout__)>( \ #op_type__ "/" #alias__)
#op_type__ "/" #alias__)
#define LITE_KERNEL_INSTANCE( \ #define LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) \
......
...@@ -86,6 +86,8 @@ class Optimizer { ...@@ -86,6 +86,8 @@ class Optimizer {
"npu_subgraph_pass", "npu_subgraph_pass",
"xpu_subgraph_pass", "xpu_subgraph_pass",
"bm_subgraph_pass", "bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph "static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's "variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device) // info(target/precision/layout/device)
......
...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
std::unordered_map<std::string, cpp::VarDesc> origin_var_maps; std::unordered_map<std::string, cpp::VarDesc> origin_var_maps;
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0); auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
auto var_size = main_block.VarsSize(); auto var_size = main_block.VarsSize();
for (int i = 0; i < var_size; i++) { for (size_t i = 0; i < var_size; i++) {
auto v = main_block.GetVar<cpp::VarDesc>(i); auto v = main_block.GetVar<cpp::VarDesc>(i);
auto name = v->Name(); auto name = v->Name();
origin_var_maps.emplace(name, *v); origin_var_maps.emplace(name, *v);
......
...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { ...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer, void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer,
size_t memory_size) { size_t memory_size) {
CHECK_EQ(offset_, 0) CHECK_EQ(offset_, 0u)
<< "Only the offset is supported to zero when the Buffer is reset."; << "Only the offset is supported to zero when the Buffer is reset.";
if (buffer_) { if (buffer_) {
CHECK_LE(memory_size_, buffer->space()) CHECK_LE(memory_size_, buffer->space())
......
...@@ -30,7 +30,7 @@ namespace core { ...@@ -30,7 +30,7 @@ namespace core {
// TODO(Superjomn) unify all the type representation across the lite framework. // TODO(Superjomn) unify all the type representation across the lite framework.
enum class Type { enum class Type {
UNK = -1, UNK = -1,
  // primary types // primary types
INT32, INT32,
INT64, INT64,
FLOAT32, FLOAT32,
...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>(); ...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>();
template <> template <>
Type StdTypeToRepr<bool>(); Type StdTypeToRepr<bool>();
template <> template <>
Type StdTypeToRepr<double>();
template <>
Type StdTypeToRepr<std::vector<char>>(); Type StdTypeToRepr<std::vector<char>>();
template <> template <>
Type StdTypeToRepr<std::string>(); Type StdTypeToRepr<std::string>();
......
...@@ -18,6 +18,11 @@ ...@@ -18,6 +18,11 @@
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT #include "paddle_use_passes.h" // NOLINT
#if defined(_WIN32)
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#endif
using namespace paddle::lite_api; // NOLINT using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path."); DEFINE_string(model_dir, "", "Model dir path.");
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "lite/fluid/data_type.h" #include "lite/fluid/data_type.h"
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
......
...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc ...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
...@@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co ...@@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
......
...@@ -11,4 +11,6 @@ add_subdirectory(fpga) ...@@ -11,4 +11,6 @@ add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(rknpu)
add_subdirectory(bridges)
add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges})
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor)
lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu)
set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu)
lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_apu
subgraph_bridge_conv_op_apu
subgraph_bridge_elementwise_ops_apu
subgraph_bridge_act_op_apu
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter);
This diff has been collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
auto axis = op_info->GetAttr<int>("axis");
// Act node
if (op_type == "fusion_elementwise_add_activation" ||
op_type == "fusion_elementwise_sub_activation" ||
op_type == "fusion_elementwise_mul_activation" ||
op_type == "fusion_elementwise_div_activation") {
auto act_type = op_info->GetAttr<std::string>("act_type");
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
auto libHandle = graph->libHandle();
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
LOAD_FUNCTIONS(
libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue)
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation)
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w = scope->FindMutableTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[APU] input dims: " << input_dims << " w dims: " << w_dims
<< " out_dims: " << out_dims << " m: " << m << " k: " << k
<< " n: " << n;
float input_scale = 1.0f;
float out_scale = 1.0f;
std::vector<float> w_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
if (op_info->HasAttr("input_scale"))
input_scale = op_info->GetAttr<float>("input_scale");
if (op_info->HasAttr("weight_scale"))
w_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
if (op_info->HasAttr("output_scale"))
out_scale = op_info->GetAttr<float>("output_scale");
} else {
return FAILED;
}
} else {
return FAILED;
}
// Add input tensor type
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = input_scale;
inType.zeroPoint = 128;
inType.dimensionCount = input_dims.size();
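  // Input dims are reordered from NCHW to NHWC, the layout used for the
  // Neuron input operand.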
std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3],
(uint32_t)input_dims[1]};
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> in_node = nullptr;
if (graph->Has(input_name)) {
// input operand already exist
in_node = graph->Get(input_name);
VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index();
} else {
// add input operand
(*neuron_model_addOperand)(model, &inType); // 0: input
in_node = graph->Add(input_name, dims_in);
}
VLOG(3) << "input_scale: " << input_scale
<< ", inType: " << inType.dimensions[0] << " : "
<< inType.dimensions[1] << " : " << inType.dimensions[2] << " : "
<< inType.dimensions[3];
NeuronOperandType wType;
wType.type = NEURON_TENSOR_QUANT8_ASYMM;
wType.scale = w_scale[0];
wType.zeroPoint = 128;
wType.dimensionCount = w_dims.size();
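  // The weight operand is declared as (n, k), matching the transposed weight
  // layout that is filled in below.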
std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]};
wType.dimensions = &dims_w[0];
(*neuron_model_addOperand)(model, &wType); // 1: weight
std::shared_ptr<Node> w_node = nullptr;
w_node = graph->Add(w_name, dims_w);
VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0]
<< ", wType dimensions: " << wType.dimensions[0] << " : "
<< wType.dimensions[1] << ", memory size: " << w->memory_size();
// Add bias type
NeuronOperandType biasType;
biasType.type = NEURON_TENSOR_INT32;
biasType.zeroPoint = 0;
biasType.scale = input_scale * w_scale[0];
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
biasType.dimensionCount = bias_dims.size();
std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]};
biasType.dimensions = &dims_bias[0];
(*neuron_model_addOperand)(model, &biasType); // 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims
<< ", bias scale: " << biasType.scale
<< " ,memory size: " << bias->memory_size();
} else {
biasType.dimensionCount = 1;
std::vector<uint32_t> dims_bias = {(uint32_t)n};
biasType.dimensions = &dims_bias[0];
(*neuron_model_addOperand)(model, &biasType); // 2: bias
bias_node = graph->Add(w_name + "_default_bias", dims_bias);
}
// Add fuse type
NeuronOperandType fuseType;
fuseType.type = NEURON_INT32;
fuseType.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0};
(*neuron_model_addOperand)(model, &fuseType); // 3: fuse
std::shared_ptr<Node> fuse_node = nullptr;
fuse_node = graph->Add(w_name + "_fuse", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = 2;
  std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
                                    (uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
VLOG(3) << "out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << " : "
<< outType.dimensions[1];
(*neuron_model_addOperand)(model, &outType); // output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_out);
  // Transpose the quantized weights from (k, n) to the (n, k) layout declared
  // above; transposeAsym also converts them to the unsigned asymmetric
  // representation expected by the QUANT8_ASYMM operand before they are
  // copied back into the weight tensor.
  Tensor transpose_filter;
  transpose_filter.Resize({(uint32_t)w_dims[1], (uint32_t)w_dims[0]});
transpose_filter.mutable_data<uint8_t>();
transposeAsym(w->data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
{1, 1, (uint32_t)w_dims[0], (uint32_t)w_dims[1]},
{0, 1, 3, 2});
memcpy(w->mutable_data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
w->memory_size());
int neuron_errCode = (*neuron_model_setOperandValue)(
model, w_node->index(), w->raw_data(), w->memory_size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set W operand value fail:" << neuron_errCode
<< ",index: " << w_node->index();
return FAILED;
}
// Add bias if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
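    // Requantize the float bias in place to int32 with scale
    // input_scale * w_scale, matching biasType.scale set above.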
int32_t* int32_bias_data =
reinterpret_cast<int32_t*>(bias->mutable_data<float>());
float2int32(bias->data<float>(), input_scale, w_scale, int32_bias_data);
VLOG(3) << int32_bias_data[0] << ":" << int32_bias_data[1] << ":"
<< int32_bias_data[2] << ":" << int32_bias_data[3];
neuron_errCode =
(*neuron_model_setOperandValue)(model,
bias_node->index(),
bias->raw_data(),
bias->memory_size()); // 2: bias
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, out_dims[1]});
int32_bias->mutable_data<int32_t>();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
VLOG(3) << "default: " << int32_bias->memory_size();
neuron_errCode =
(*neuron_model_setOperandValue)(model,
bias_node->index(),
int32_bias->raw_data(),
int32_bias->memory_size()); // 2: bias
bias_node->set_data(int32_bias);
}
  // Add fuse value (0: no fused activation)
int32_t fuse_val[1] = {0};
(*neuron_model_setOperandValue)(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse
std::vector<uint32_t> addInIndex = {in_node->index(),
w_node->index(),
bias_node->index(),
fuse_node->index()};
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_FULLY_CONNECTED,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc, kAPU, paddle::lite::subgraph::apu::FCConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include <utility>
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
    LOG(FATAL) << "[APU] Node " << name << " is redefined.";
return -1;
} else {
VLOG(3) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
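  // Each node added to the graph consumes one Neuron operand index.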
operandIdx_ += 1;
it->second.push_back(node);
return it->second.size();
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "NeuronAdapter.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
// Graph and Node are defined to collect all of the converted operands of the
// Neuron model
class Node {
public:
Node(int32_t operand_idx, std::vector<uint32_t> shape)
: idx_(operand_idx), shape_(shape) {}
void set_shape(std::vector<uint32_t> shape) { shape_ = shape; }
uint32_t index() { return idx_; }
std::vector<uint32_t> shape() const { return shape_; }
void set_data(std::shared_ptr<Tensor> data) { data_ = data; }
private:
int32_t idx_;
std::vector<uint32_t> shape_;
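  // Keeps constant data (e.g. a generated bias tensor) alive while the Neuron
  // model is being built.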
std::shared_ptr<Tensor> data_{nullptr};
};
class Graph {
public:
int Add(const std::string& name, std::shared_ptr<Node> node);
// Variable, const or data node
std::shared_ptr<Node> Add(const std::string& name,
std::vector<uint32_t> shape) {
CHECK(shape.size()) << name << " : " << shape.size();
auto node = std::make_shared<Node>(operandIdx_, shape);
auto idx = Add(name, node);
CHECK_GE(idx, 1);
return node;
}
void set_model(NeuronModel* model) { model_ = model; }
NeuronModel* model() { return model_; }
void set_libHandle(void* libHandle) { libHandle_ = libHandle; }
void* libHandle() { return libHandle_; }
  void set_input_names(const std::vector<std::string>& input_names) {
    input_names_ = input_names;
  }
  bool IsInput(const std::string& name) {
    for (size_t i = 0; i < input_names_.size(); i++) {
      if (input_names_[i] == name) return true;
    }
    return false;
  }
  bool IsOutput(const std::string& name) {
    for (size_t i = 0; i < output_names_.size(); i++) {
      if (output_names_[i] == name) return true;
    }
    return false;
  }
  void set_output_names(const std::vector<std::string>& output_names) {
    output_names_ = output_names;
  }
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[APU] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
private:
void* libHandle_;
NeuronModel* model_;
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
int32_t operandIdx_ = 0;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d, kAPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kAPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU);