提交 80af11c6 编写于 作者: J jiweibo

Merge remote-tracking branch 'origin' into test_result

...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " ...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}") message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
# Windows-only global build configuration (applies to every target below).
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
# Stop Visual Studio from re-running CMake/regenerating projects on each build.
set(CMAKE_SUPPRESS_REGENERATION ON)
# Name static libraries "libfoo.lib" to match the Unix-style prefix used elsewhere.
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
# glog is linked statically here, so its DLL export macro must expand to nothing.
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
if (MSVC_STATIC_CRT)
# /MT(d): statically link the C runtime; /bigobj: allow >65k sections per object file.
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
# Silence known-noisy MSVC warnings (unknown pragmas, narrowing conversions,
# source-encoding notices, etc.) for the whole build.
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
# /MP: let cl.exe compile translation units in parallel.
add_compile_options(/MP)
message(STATUS "Using parallel compiling (/MP)")
# Suppress benign linker warnings (e.g. empty object files) on all link steps.
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
find_package(CUDA QUIET) find_package(CUDA QUIET)
endif() endif()
...@@ -64,6 +89,7 @@ lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) ...@@ -64,6 +89,7 @@ lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
...@@ -106,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING ...@@ -106,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
if(WIN32)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
else()
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE) FORCE)
endif()
endif() endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
...@@ -141,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) ...@@ -141,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/postproject) include(cross_compiling/postproject)
include(device/npu) # check and prepare NPU DDK include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK include(device/xpu) # check and prepare XPU SDK
include(device/apu) # check and prepare APU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed. # So the following third party dependencies are not needed.
...@@ -190,6 +224,7 @@ endif() ...@@ -190,6 +224,7 @@ endif()
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm include(external/libxsmm) # download, build, install libxsmm
include(external/gflags) # download, build, install gflags include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog include(external/glog) # download, build, install glog
......
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
rem global variables
set BUILD_EXTRA=OFF
set BUILD_JAVA=ON
set BUILD_PYTHON=OFF
set BUILD_DIR=%source_path%
set OPTMODEL_DIR=""
set BUILD_TAILOR=OFF
set BUILD_CV=OFF
set SHUTDOWN_LOG=ON
set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
set workspace=%source_path%
rem Derived paths. These must be defined BEFORE first use: the previous
rem version tested %build_directory% before setting it, so the stale build
rem directory was never cleaned and never created when missing.
set root_dir=%workspace%
set build_directory=%BUILD_DIR%\build.lite.x86
set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code
set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "%vcvarsall_dir%" (
echo "------------%vcvarsall_dir% not exist------------"
goto set_vcvarsall_dir
)
call:prepare_thirdparty
rem Recreate the build directory from scratch (md creates parent dirs too).
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
)
md "%build_directory%"
rem for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
rem here we fake an empty file to make cmake work.
if NOT EXIST "%GEN_CODE_PATH_PREFIX%" (
md "%GEN_CODE_PATH_PREFIX%"
)
type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc"
if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" (
md "%DEBUG_TOOL_PATH_PREFIX%"
)
copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\"
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON  ^
-DWITH_MKLDNN=OFF ^
-DLITE_WITH_X86=ON  ^
-DLITE_WITH_PROFILE=OFF ^
-DWITH_LITE=ON ^
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^
-DLITE_WITH_ARM=OFF ^
-DWITH_GPU=OFF ^
-DLITE_BUILD_EXTRA=ON ^
-DLITE_WITH_PYTHON=ON ^
-DPYTHON_EXECUTABLE="%python_path%"
call "%vcvarsall_dir%" amd64
msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1
goto:eof
:prepare_thirdparty
SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>"
set tmp_var=!python_path!
call:remove_space
set python_path=!tmp_var!
if "!python_path!"=="" (
set python_path=python.exe
) else (
if NOT exist "!python_path!" (
echo "------------!python_path! not exist------------"
goto:eof
)
)
if EXIST "%workspace%\third-party" (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists."
) else (
echo "The directory of third_party exists, the third-party-05b862.tar.gz exists."
call:rm_rebuild_dir "%workspace%\third-party"
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
) else (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
call:download_third_party
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
) else (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists."
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
)
git submodule update --init --recursive
goto:eof
:download_third_party
rem Download the prebuilt third-party bundle into the workspace via PowerShell.
powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^
'%workspace%third-party-05b862.tar.gz')
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
rem Strip leading and trailing spaces from the global variable tmp_var.
rem Callers set tmp_var, call this, then read tmp_var back.
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
rem Fix: loop on the right-trim label directly; the old jump back to
rem remove_left_space rescanned the (already clean) left side each pass.
goto remove_right_space
)
goto:eof
\ No newline at end of file
...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND) ...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG}) set(SIMD_FLAG ${SSE3_FLAG})
endif() endif()
if(WIN32)
    # Windows-only define for every target; presumably suppresses MSVC's
    # xkeycheck.h macroized-keyword check — confirm against the SDK headers.
    add_definitions(-D_XKEYCHECK_H)
    if(NOT MSVC)
        # FATAL_ERROR aborts configuration. The previous message(FATAL ...)
        # used an invalid mode keyword, so it only printed and carried on.
        message(FATAL_ERROR "Windows builds only support MSVC (required by NVIDIA's nvcc compiler).")
    endif()
endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
add_definitions(-DLITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_GPU)
...@@ -134,6 +143,10 @@ if (LITE_WITH_NPU) ...@@ -134,6 +143,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU") add_definitions("-DLITE_WITH_NPU")
endif() endif()
# Expose APU support to C/C++ sources as a preprocessor definition.
if (LITE_WITH_APU)
add_definitions("-DLITE_WITH_APU")
endif()
if (LITE_WITH_RKNPU) if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU") add_definitions("-DLITE_WITH_RKNPU")
endif() endif()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# APU DDK detection: locate the NeuronAdapter header and the neuron /
# neuron_adapter runtime libraries under APU_DDK_ROOT, and expose them as
# imported targets plus the cached apu_runtime_libs list.
if(NOT LITE_WITH_APU)
  return()
endif()

# APU_DDK_ROOT may come from the cache/command line or the environment.
if(NOT DEFINED APU_DDK_ROOT)
  set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
  if(NOT APU_DDK_ROOT)
    message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
  endif()
endif()
message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")

find_path(APU_DDK_INC NAMES NeuronAdapter.h
  PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH)
if(NOT APU_DDK_INC)
  message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")

# Use the directory actually located by find_path instead of re-deriving it.
include_directories("${APU_DDK_INC}")

# Only the 64-bit layout is shipped today; the armv8 branch keeps the hook
# for other ABI-specific sub-paths.
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
  set(APU_SUB_LIB_PATH "lib64")
endif()

# NO_DEFAULT_PATH keeps a host-installed library from shadowing the DDK one.
find_library(APU_NEURON_FILE NAMES neuron
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH} NO_DEFAULT_PATH)
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH} NO_DEFAULT_PATH)

if(NOT APU_NEURON_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
  message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
  add_library(apu_neuron SHARED IMPORTED GLOBAL)
  set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()

if(NOT APU_NEURON_ADAPTER_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
  message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
  add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
  set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()

# Cached so other CMake files can link against the APU runtime.
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "apu_runtime_libs: ${apu_runtime_libs}")
...@@ -32,6 +32,7 @@ IF(WIN32) ...@@ -32,6 +32,7 @@ IF(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE) ELSEIF(APPLE)
#TODO(intel-huying): #TODO(intel-huying):
......
...@@ -22,7 +22,7 @@ endfunction() ...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET) function (lite_deps TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS}) set(deps ${lite_deps_DEPS})
...@@ -88,6 +88,12 @@ function (lite_deps TARGET) ...@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var) endforeach(var)
endif() endif()
if (LITE_WITH_APU)
foreach(var ${lite_deps_APU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_RKNPU) if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS}) foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var}) set(deps ${deps} ${var})
...@@ -137,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean ...@@ -137,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET) function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module) set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -153,6 +159,7 @@ function(lite_cc_library TARGET) ...@@ -153,6 +159,7 @@ function(lite_cc_library TARGET)
CV_DEPS ${args_CV_DEPS} CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -168,8 +175,10 @@ function(lite_cc_library TARGET) ...@@ -168,8 +175,10 @@ function(lite_cc_library TARGET)
else() else()
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
# collect targets need to compile for lite # collect targets need to compile for lite
if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET}) add_dependencies(lite_compile_deps ${TARGET})
...@@ -184,7 +193,7 @@ function(lite_cc_binary TARGET) ...@@ -184,7 +193,7 @@ function(lite_cc_binary TARGET)
set(options " -g ") set(options " -g ")
endif() endif()
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -197,6 +206,7 @@ function(lite_cc_binary TARGET) ...@@ -197,6 +206,7 @@ function(lite_cc_binary TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -207,7 +217,9 @@ function(lite_cc_binary TARGET) ...@@ -207,7 +217,9 @@ function(lite_cc_binary TARGET)
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
) )
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
if (NOT APPLE) if (NOT APPLE)
# strip binary target to reduce size # strip binary target to reduce size
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
...@@ -234,7 +246,7 @@ function(lite_cc_test TARGET) ...@@ -234,7 +246,7 @@ function(lite_cc_test TARGET)
endif() endif()
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS ARGS
COMPILE_LEVEL # (basic|extra) COMPILE_LEVEL # (basic|extra)
...@@ -255,6 +267,7 @@ function(lite_cc_test TARGET) ...@@ -255,6 +267,7 @@ function(lite_cc_test TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -272,7 +285,9 @@ function(lite_cc_test TARGET) ...@@ -272,7 +285,9 @@ function(lite_cc_test TARGET)
"${TARGET}" "${TARGET}"
COMMENT "Strip debug symbols done on final executable file.") COMMENT "Strip debug symbols done on final executable file.")
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
file(APPEND ${offline_test_registry_file} "${TARGET}\n") file(APPEND ${offline_test_registry_file} "${TARGET}\n")
# collect targets need to compile for lite # collect targets need to compile for lite
...@@ -286,6 +301,7 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") ...@@ -286,6 +301,7 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels") set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels") set(bm_kernels CACHE INTERNAL "bm kernels")
...@@ -305,12 +321,12 @@ if(LITE_BUILD_TAILOR) ...@@ -305,12 +321,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif() endif()
# add a kernel for some specific device # add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU) # device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra) # level: one of (basic, extra)
function(add_kernel TARGET device level) function(add_kernel TARGET device level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -368,6 +384,15 @@ function(add_kernel TARGET device level) ...@@ -368,6 +384,15 @@ function(add_kernel TARGET device level)
endif() endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "APU")
if (NOT LITE_WITH_APU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -451,6 +476,7 @@ function(add_kernel TARGET device level) ...@@ -451,6 +476,7 @@ function(add_kernel TARGET device level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -473,7 +499,7 @@ endif() ...@@ -473,7 +499,7 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -506,6 +532,7 @@ function(add_operator TARGET level) ...@@ -506,6 +532,7 @@ function(add_operator TARGET level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -516,6 +543,29 @@ function(add_operator TARGET level) ...@@ -516,6 +543,29 @@ function(add_operator TARGET level)
) )
endfunction() endfunction()
# Windows-only helper: merge_static_libs cannot cope with arbitrarily long
# input lists, so the libraries are merged in chunks. Each chunk becomes an
# intermediate "<target>_dummy_<i>" archive, and those intermediates are then
# merged into the final TARGET_NAME archive.
function(create_static_lib TARGET_NAME)
  set(input_libs ${ARGN})
  list(REMOVE_DUPLICATES input_libs)
  list(LENGTH input_libs total_count)
  # A chunk is flushed once it holds more than this many libraries.
  set(chunk_limit 60)
  set(chunk_index 1)
  set(position 1)
  set(current_chunk)
  set(partial_targets)
  foreach(one_lib ${input_libs})
    list(APPEND current_chunk ${one_lib})
    list(LENGTH current_chunk chunk_size)
    # Flush on overflow or when the last input library has been consumed.
    if((${chunk_size} GREATER ${chunk_limit}) OR (${position} EQUAL ${total_count}))
      merge_static_libs(${TARGET_NAME}_dummy_${chunk_index} ${current_chunk})
      list(APPEND partial_targets ${TARGET_NAME}_dummy_${chunk_index})
      set(current_chunk)
      math(EXPR chunk_index "${chunk_index}+1")
    endif()
    math(EXPR position "${position}+1")
  endforeach()
  merge_static_libs(${TARGET_NAME} ${partial_targets})
endfunction()
# Bundle several static libraries into one. # Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target) function(bundle_static_library tgt_name bundled_tgt_name fake_target)
...@@ -559,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) ...@@ -559,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
set(bundled_tgt_full_name set(bundled_tgt_full_name
${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
#message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
if(WIN32)
set(dummy_tgt_name dummy_${bundled_tgt_name})
create_static_lib(${bundled_tgt_name} ${static_libs})
add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
add_dependencies(${fake_target} ${tgt_name})
add_library(${dummy_tgt_name} STATIC IMPORTED)
set_target_properties(${dummy_tgt_name}
PROPERTIES
IMPORTED_LOCATION ${bundled_tgt_full_name}
INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
add_dependencies(${dummy_tgt_name} ${fake_target})
return()
endif()
if(NOT IOS) if(NOT IOS)
file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
......
...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") ...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
...@@ -71,6 +72,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -71,6 +72,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU) endif(LITE_WITH_XPU)
if (LITE_WITH_APU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu")
endif(LITE_WITH_APU)
if (LITE_WITH_FPGA) if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA) endif(LITE_WITH_FPGA)
...@@ -87,6 +91,38 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") ...@@ -87,6 +91,38 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib # add python lib
if (LITE_WITH_PYTHON) if (LITE_WITH_PYTHON)
if(WIN32)
set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd")
set(LITE_CORE_DEPS ${LITE_CORE})
add_custom_command(OUTPUT ${LITE_CORE}
COMMAND cmake -E copy $<TARGET_FILE:lite_pybind> ${LITE_CORE}
DEPENDS lite_pybind)
add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS})
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd"
DEPENDS copy_lite_pybind
)
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
else()
if(APPLE) if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET} add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
...@@ -123,6 +159,7 @@ if (LITE_WITH_PYTHON) ...@@ -123,6 +159,7 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo) add_dependencies(publish_inference publish_inference_python_light_demo)
endif(WIN32)
endif() endif()
if (LITE_WITH_CUDA OR LITE_WITH_X86) if (LITE_WITH_CUDA OR LITE_WITH_X86)
...@@ -141,7 +178,7 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86) ...@@ -141,7 +178,7 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib) add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party) add_dependencies(publish_inference publish_inference_third_party)
else() elseif(NOT WIN32)
add_custom_target(publish_inference_cxx_lib ${TARGET} add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -164,6 +201,36 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86) ...@@ -164,6 +201,36 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
endif() endif()
if (LITE_WITH_X86) if (LITE_WITH_X86)
if(WIN32)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
else()
add_custom_target(publish_inference_x86_cxx_lib ${TARGET} add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -178,6 +245,7 @@ if (LITE_WITH_X86) ...@@ -178,6 +245,7 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos) add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
......
...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml) add_dependencies(paddle_full_api_shared dynload_mklml)
endif() endif()
if(WIN32)
target_link_libraries(paddle_full_api_shared shlwapi.lib)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
...@@ -34,12 +37,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -34,12 +37,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
) )
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(NOT APPLE) if(NOT APPLE AND NOT WIN32)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
...@@ -78,7 +82,9 @@ if (WITH_TESTING) ...@@ -78,7 +82,9 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}) MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels})
endif() endif()
if(LITE_WITH_FPGA) if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps}) set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -103,6 +109,7 @@ message(STATUS "get Host kernels ${host_kernels}") ...@@ -103,6 +109,7 @@ message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get APU kernels ${apu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
...@@ -122,6 +129,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -122,6 +129,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -143,6 +151,7 @@ lite_cc_library(light_api SRCS light_api.cc ...@@ -143,6 +151,7 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -163,6 +172,7 @@ if(WITH_TESTING) ...@@ -163,6 +172,7 @@ if(WITH_TESTING)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -220,7 +230,7 @@ if(WITH_TESTING) ...@@ -220,7 +230,7 @@ if(WITH_TESTING)
endif() endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
DEPS ${lite_model_test_DEPS} DEPS ${lite_model_test_DEPS}
...@@ -292,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -292,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}) BM_DEPS ${bm_kernels})
...@@ -356,6 +367,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ...@@ -356,6 +367,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -376,6 +388,7 @@ if(NOT IOS) ...@@ -376,6 +388,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -390,6 +403,7 @@ if(NOT IOS) ...@@ -390,6 +403,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -404,6 +418,7 @@ if(NOT IOS) ...@@ -404,6 +418,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -419,6 +434,7 @@ if(NOT IOS) ...@@ -419,6 +434,7 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -429,6 +445,7 @@ if(NOT IOS) ...@@ -429,6 +445,7 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
...@@ -445,6 +462,7 @@ if(NOT IOS) ...@@ -445,6 +462,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${npu_kernels} RKNPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
......
...@@ -13,7 +13,13 @@ ...@@ -13,7 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include <time.h> #include <time.h>
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
......
...@@ -19,6 +19,11 @@ ...@@ -19,6 +19,11 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/version.h" #include "lite/core/version.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h> #include <omp.h>
......
...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { ...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) {
if (element == input_names_.end()) { if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:"; << "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) { for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]"; LOG(ERROR) << "[" << input_names_[i] << "]";
} }
return nullptr; return nullptr;
...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds; std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs; std::vector<cpp::OpDesc*> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) { for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() {
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
output_names_.resize(fetchs.size()); output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) { for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] = input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front(); feeds[i]->Output("Out").front();
} }
for (int i = 0; i < fetchs.size(); i++) { for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] = output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front(); fetchs[i]->Input("X").front();
} }
......
...@@ -37,11 +37,11 @@ TEST(LightAPI, load) { ...@@ -37,11 +37,11 @@ TEST(LightAPI, load) {
const std::vector<std::string> inputs = predictor.GetInputNames(); const std::vector<std::string> inputs = predictor.GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
const std::vector<std::string> outputs = predictor.GetOutputNames(); const std::vector<std::string> outputs = predictor.GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
......
...@@ -293,13 +293,13 @@ int main(int argc, char** argv) { ...@@ -293,13 +293,13 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
std::vector<std::string> str_input_shapes_0 = std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0); split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0; std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) { for (size_t i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
} }
......
...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
#ifdef LITE_WITH_X86
config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kInt64)},
Place{TARGET(kHost), PRECISION(kFloat)}});
#else
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
#endif
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -198,7 +204,7 @@ int main(int argc, char** argv) { ...@@ -198,7 +204,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -310,7 +310,7 @@ int main(int argc, char** argv) { ...@@ -310,7 +310,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -114,7 +114,7 @@ void detect_object(const float* dout, ...@@ -114,7 +114,7 @@ void detect_object(const float* dout,
} }
std::string name = FLAGS_out_txt + "_accu.txt"; std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w"); FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) { for (size_t i = 0; i < objects.size(); ++i) {
Object object = objects.at(i); Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 && if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) { object.width > 0 && object.height > 0) {
...@@ -324,7 +324,7 @@ int main(int argc, char** argv) { ...@@ -324,7 +324,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -104,17 +104,21 @@ std::vector<Place> ParserValidPlaces() { ...@@ -104,17 +104,21 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") { } else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86)); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else if (target_repr == "rknpu") { } else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU)); valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") { } else if (target_repr == "apu") {
valid_places.emplace_back(TARGET(kMLU)); valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -192,6 +196,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -192,6 +196,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -256,16 +261,16 @@ void PrintHelpInfo() { ...@@ -256,16 +261,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places_.emplace_back(TARGET(kRKNPU));
valid_places_.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places_.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { ...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() {
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n" " `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for " " ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n" "tailoring lib, false by default`\n"
...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the lengh of the first column: maximum length of the op_type // Get the lengh of the first column: maximum length of the op_type
......
...@@ -36,11 +36,11 @@ TEST(CxxApi, run) { ...@@ -36,11 +36,11 @@ TEST(CxxApi, run) {
auto inputs = predictor->GetInputNames(); auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
auto outputs = predictor->GetOutputNames(); auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
auto input_tensor = predictor->GetInputByName(inputs[0]); auto input_tensor = predictor->GetInputByName(inputs[0]);
......
...@@ -18,20 +18,21 @@ ...@@ -18,20 +18,21 @@
*/ */
#pragma once #pragma once
#define USE_LITE_OP(op_type__) \ // some platform-independent defintion
extern int touch_op_##op_type__(); \ #include "lite/utils/macros.h"
int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
touch_op_##op_type__(); #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
__attribute__((unused)) = \ UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
touch_##op_type__##target__##precision__##layout__##alias__();
#define USE_MIR_PASS(name__) \ #define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \ extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ __attribute__((unused)) = \ static bool mir_pass_usage##name__ UNUSED = \
mir_pass_registry##name__##_fake(); mir_pass_registry##name__##_fake();
#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
...@@ -73,7 +73,8 @@ const std::string& TargetToStr(TargetType target) { ...@@ -73,7 +73,8 @@ const std::string& TargetToStr(TargetType target) {
"xpu", "xpu",
"bm", "bm",
"mlu", "mlu",
"rknpu"}; "rknpu",
"apu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -113,9 +114,10 @@ const std::string& TargetRepr(TargetType target) { ...@@ -113,9 +114,10 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kMLU",
"kBM", "kBM",
"kRKNPU"}; "kMLU",
"kRKNPU",
"kAPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -158,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -158,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kXPU), TARGET(kXPU),
TARGET(kBM), TARGET(kBM),
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU),
TARGET(kFPGA)}); TARGET(kFPGA)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
......
...@@ -49,14 +49,15 @@ enum class TargetType : int { ...@@ -49,14 +49,15 @@ enum class TargetType : int {
kCUDA = 3, kCUDA = 3,
kARM = 4, kARM = 4,
kOpenCL = 5, kOpenCL = 5,
kAny = 6, // any target
kFPGA = 7, kFPGA = 7,
kNPU = 8, kNPU = 8,
kXPU = 9, kXPU = 9,
kBM = 10, kBM = 10,
kMLU = 11, kMLU = 11,
kRKNPU = 12, kRKNPU = 12,
kAny = 6, // any target kAPU = 13,
NUM = 13, // number of fields. NUM = 14, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
......
...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass); ...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
...@@ -11,3 +11,12 @@ ...@@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import sys
if os.name =='nt':
current_path = os.path.abspath(os.path.dirname(__file__))
third_lib_path = current_path + os.sep + 'libs'
os.environ['path'] = third_lib_path+ ';' + os.environ['path']
sys.path.insert(0, third_lib_path)
...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif() endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH) if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif() endif()
...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { ...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) {
.value("FPGA", TargetType::kFPGA) .value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU) .value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -34,20 +34,27 @@ else: ...@@ -34,20 +34,27 @@ else:
# core lib of paddlelite is stored as lite.so # core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']} PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs # put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = [] PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] if os.name != 'nt':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
else:
PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll']
shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ if os.name != 'nt':
/inference_lite_lib/python/install/lite/lite.so" COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
if os.system(COMMAND) != 0: /inference_lite_lib/python/install/lite/lite.so"
raise Exception("patch third_party libs failed, command: %s" % COMMAND) if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'): if os.path.isfile(LIB_PATH+'/__init__.py'):
...@@ -61,6 +68,14 @@ PACKAGE_DIR = { ...@@ -61,6 +68,14 @@ PACKAGE_DIR = {
'paddlelite': LITE_PATH 'paddlelite': LITE_PATH
} }
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in PACKAGE_DIR.items():
fix_package_dir[k] = v.replace('/', '\\')
PACKAGE_DIR = fix_package_dir
setup( setup(
name='paddlelite', name='paddlelite',
version=PADDLELITE_VERSION, version=PADDLELITE_VERSION,
......
...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) {
for (size_t i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5); EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5);
} }
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
} }
......
...@@ -15,7 +15,12 @@ ...@@ -15,7 +15,12 @@
#pragma once #pragma once
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#include <time.h> #include <time.h>
#include <cmath> #include <cmath>
......
...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986,
0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048,
6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) {
0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103,
0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places,
for (int i = 0; i < outs->numel(); ++i) { for (int i = 0; i < outs->numel(); ++i) {
LOG(INFO) << o_data[i]; LOG(INFO) << o_data[i];
} }
for (int i = 0; i < lod.size(); ++i) { for (size_t i = 0; i < lod.size(); ++i) {
for (int j = 0; j < lod[i].size(); ++j) { for (size_t j = 0; j < lod[i].size(); ++j) {
LOG(INFO) << lod[i][j]; LOG(INFO) << lod[i][j];
} }
} }
......
...@@ -8,4 +8,5 @@ add_subdirectory(npu) ...@@ -8,4 +8,5 @@ add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu) add_subdirectory(rknpu)
# Build rules for the APU (MediaTek Neuron Runtime) backend wrapper.
# Skip this directory entirely unless APU support was enabled via LITE_WITH_APU.
if(NOT LITE_WITH_APU)
  return()
endif()

# Thin runtime library that dlopen-loads the Neuron API (see device.cc).
lite_cc_library(device_apu SRCS device.cc)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/apu/device.h"
#include <dlfcn.h>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace apu {
// Resolve the symbol `name` from a dlopen'ed library handle.
// Returns the raw function pointer, or nullptr (after logging a
// warning with the dlerror() diagnostic) when the symbol is absent.
inline void* LoadFunc(void* libHandle, const char* name) {
  CHECK(libHandle != nullptr);
  CHECK(name != nullptr);
  void* symbol = dlsym(libHandle, name);
  if (symbol != nullptr) {
    return symbol;
  }
  LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
               << "] Because " << dlerror();
  return symbol;
}
// Create and finish a NeuronCompilation for `model`.
// `libHandle` is the dlopen'ed Neuron Runtime library; the
// NeuronCompilation_* entry points are resolved from it at runtime via
// LoadFunc. Returns the finished compilation on success, nullptr on any
// failure (missing symbol, create error, or finish error).
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
  typedef int (*NeuronCompilation_create)(NeuronModel * model,
                                          NeuronCompilation * *compilation);
  typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
  typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);

#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
  FUNC_NAME VARIABLE_NAME =                                 \
      reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS

  // LoadFunc returns nullptr (and warns) for missing symbols; calling
  // through a null pointer below would crash, so bail out early instead.
  if (neuron_compilation_create == nullptr ||
      neuron_compilation_free == nullptr ||
      neuron_compilation_finish == nullptr) {
    LOG(WARNING) << "[APU] Neuron Runtime compilation symbols not resolved";
    return nullptr;
  }

  int neuron_errCode = 0;
  NeuronCompilation* compilation = NULL;

  VLOG(3) << "[APU] Compile model";

  neuron_errCode = (*neuron_compilation_create)(model, &compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
    return nullptr;
  }

  neuron_errCode = (*neuron_compilation_finish)(compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
    // Release the partially-built compilation instead of leaking it
    // (the original code loaded NeuronCompilation_free but never used it).
    (*neuron_compilation_free)(compilation);
    return nullptr;
  }

  VLOG(3) << "[APU] Build done";

  return compilation;
}
} // namespace apu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
namespace paddle {
namespace lite {
namespace apu {
// Process-wide access point to the APU (Neuron Runtime) backend.
class Device {
 public:
  // Singleton accessor: one shared Device per process
  // (function-local static, constructed on first use).
  static Device& Global() {
    static Device x;
    return x;
  }
  Device() {}

  // Creates and finishes a NeuronCompilation for `model`, resolving the
  // NeuronCompilation_* entry points from `libHandle` (a dlopen'ed Neuron
  // Runtime library). Returns nullptr on failure. Defined in device.cc.
  NeuronCompilation* Build(void* libHandle, NeuronModel* model);
};
} // namespace apu
} // namespace lite
} // namespace paddle
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width const sampler_t sampler =
const int y = get_global_id(1); // image_height CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void leaky_relu(__read_only image2d_t input, __kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void tanh_act(__read_only image2d_t input, __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void exp_act(__read_only image2d_t input, __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input,
} }
__kernel void swish(__read_only image2d_t input, __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input,
if (ceil_h > in_dims_h - 1) { if (ceil_h > in_dims_h - 1) {
ceil_h = in_dims_h- 1; ceil_h = in_dims_h- 1;
} }
float wight0_w = center_w - floor_w; CL_DTYPE wight0_w = center_w - floor_w;
float wight0_h = center_h - floor_h; CL_DTYPE wight0_h = center_h - floor_h;
float wight1_w = 1.0 - wight0_w; CL_DTYPE wight1_w = 1.0 - wight0_w;
float wight1_h = 1.0 - wight0_h; CL_DTYPE wight1_h = 1.0 - wight0_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input,
CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
+ (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim1, __private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim1,
__read_only image2d_t input_image, __private const int global_size_dim2,
__read_only image2d_t filter, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
__private const int input_c_block, __private const int input_c_block,
__private const int input_c_origin, __private const int input_c_origin,
__private const int dilation, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int old_w) { __private const int old_w) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
out_w, "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
out_C - 4 * out_c, "%.2f\n",
(float)(in[input_pos0]), out_w,
(float)(in[input_pos1]), out_C - 4 * out_c,
(float)(in[input_pos2]), (float)(in[input_pos0]),
(float)(in[input_pos3])); (float)(in[input_pos1]),
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, (float)(in[input_pos2]),
output_pos.x, output_pos.y, (float)(in[input_pos3]));
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
out_c,
out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
const int in_h = in_nh % in_height; const int in_h = in_nh % in_height;
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
__global uchar* out, __global uchar* out,
__private const int size_ch, __private const int size_ch,
__private const int size_block, __private const int size_block,
__private const int size_batch, __private const int size_batch,
__private const int C) { __private const int C) {
const int in_c = get_global_id(0); const int in_c = get_global_id(0);
const int in_w = get_global_id(1); const int in_w = get_global_id(1);
const int in_nh = get_global_id(2); const int in_nh = get_global_id(2);
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { ...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() {
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0; size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) { for (size_t i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len; wgt_offset_tmp += groups[i] * block_len;
} }
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
......
...@@ -28,6 +28,12 @@ ...@@ -28,6 +28,12 @@
#define posix_memalign_free free #define posix_memalign_free free
#endif #endif
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) { void* GenBase::operator new(size_t size) {
void* ptr; void* ptr;
constexpr size_t alignment = 32ul; constexpr size_t alignment = 32ul;
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0, 0,
"GenBase Alloc %ld error!", "GenBase Alloc %ld error!",
size); size);
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr; return ptr;
} }
......
...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> { ...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// size_t num_seqs = scores->NumElements(lod_level); // size_t num_seqs = scores->NumElements(lod_level);
size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t num_seqs = scores->lod()[lod_level].size() - 1;
size_t seq_width = 1; size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) { for (size_t i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i]; seq_width *= scores->dims()[i];
} }
......
...@@ -23,7 +23,7 @@ namespace math { ...@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols, int num_flatten_cols,
bool trans) { bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1); PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
MatDescriptor retv; MatDescriptor retv;
if (num_flatten_cols > 1) { if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { ...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
auto idx_dims = index->dims(); auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, out_dims); PADDLE_ENFORCE_EQ(idx_dims, out_dims);
...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) { lite::Tensor* index) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { ...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor {
auto idx_dims = index.dims(); auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) { for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, og_dims); PADDLE_ENFORCE_EQ(idx_dims, og_dims);
......
...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { ...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() {
// Do not support nested omp parallem. // Do not support nested omp parallem.
num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); num_threads = omp_in_parallel() ? 1 : omp_get_max_threads();
#endif #endif
return std::max(num_threads, 1L); return std::max<int>(num_threads, 1L);
} }
using ThreadHandler = using ThreadHandler =
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <time.h>
#include <cstdio> #include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <time.h>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -37,7 +37,9 @@ ...@@ -37,7 +37,9 @@
#define GOOGLE_GLOG_DLL_DECL #define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
#include <stdio.h> #include <stdio.h>
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> #include <windows.h>
#include <winsock.h>
#include <numeric> // std::accumulate in msvc #include <numeric> // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h #ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { ...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule); return reinterpret_cast<void *>(hModule);
} }
extern struct timeval;
static int gettimeofday(struct timeval *tp, void *tzp) { static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock; time_t clock;
struct tm tm; struct tm tm;
......
...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
endif() endif()
if (LITE_WITH_X86)
lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc) lite_cc_library(types SRCS types.cc)
else()
lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc)
endif()
lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel)
lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(scope SRCS scope.cc DEPS tensor)
lite_cc_library(device_info SRCS device_info.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
......
...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { ...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() {
CHECK(!shared_tensor_array->empty()) CHECK(!shared_tensor_array->empty())
<< "shared_tensor_array is empty yet"; << "shared_tensor_array is empty yet";
target_tensor_array->resize(shared_tensor_array->size()); target_tensor_array->resize(shared_tensor_array->size());
for (int i = 0; i < shared_tensor_array->size(); i++) { for (size_t i = 0; i < shared_tensor_array->size(); i++) {
target_tensor_array->at(i).Resize( target_tensor_array->at(i).Resize(
shared_tensor_array->at(i).dims()); shared_tensor_array->at(i).dims());
TargetCopy(param_type->type->target(), TargetCopy(param_type->type->target(),
...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, ...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name,
auto b_tensor_array = auto b_tensor_array =
base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>(); base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>();
CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size());
for (int i = 0; i < a_tensor_array->size(); i++) { for (size_t i = 0; i < a_tensor_array->size(); i++) {
Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* a_tensor = &(a_tensor_array->at(i));
Tensor* b_tensor = &(b_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i));
if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) {
......
...@@ -166,7 +166,7 @@ class TestCase { ...@@ -166,7 +166,7 @@ class TestCase {
// TODO(Superjomn) Move this method to utils or DDim? // TODO(Superjomn) Move this method to utils or DDim?
bool ShapeEquals(const DDim& a, const DDim& b) { bool ShapeEquals(const DDim& a, const DDim& b) {
if (a.size() != b.size()) return false; if (a.size() != b.size()) return false;
for (int i = 0; i < a.size(); i++) { for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return false; if (a[i] != b[i]) return false;
} }
return true; return true;
......
...@@ -54,6 +54,7 @@ using HostContext = Context<TargetType::kHost>; ...@@ -54,6 +54,7 @@ using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>; using NPUContext = Context<TargetType::kNPU>;
using APUContext = Context<TargetType::kAPU>;
using XPUContext = Context<TargetType::kXPU>; using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>; using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>; using FPGAContext = Context<TargetType::kFPGA>;
...@@ -87,6 +88,21 @@ class Context<TargetType::kNPU> { ...@@ -87,6 +88,21 @@ class Context<TargetType::kNPU> {
}; };
#endif #endif
#ifdef LITE_WITH_APU
template <>
class Context<TargetType::kAPU> {
public:
Context() {}
explicit Context(const APUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(APUContext* ctx) {}
APUContext& operator=(const APUContext& ctx) {}
std::string name() const { return "APUContext"; }
};
#endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
template <> template <>
class Context<TargetType::kBM> { class Context<TargetType::kBM> {
...@@ -324,27 +340,17 @@ class Context<TargetType::kX86> { ...@@ -324,27 +340,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
...@@ -408,6 +414,12 @@ class ContextScheduler { ...@@ -408,6 +414,12 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
&ctx->As<APUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU #ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU): case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo( kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
...@@ -483,6 +495,9 @@ class ContextScheduler { ...@@ -483,6 +495,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
#ifdef LITE_WITH_RKNPU #ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>(); InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif #endif
......
...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { ...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
active_ids_ = core_ids_; active_ids_ = core_ids_;
} else { } else {
active_ids_.resize(thread_num); active_ids_.resize(thread_num);
for (int i = 0; i < thread_num; ++i) { for (uint32_t i = 0; i < thread_num; ++i) {
if (i < big_core_ids_.size()) { if (i < big_core_ids_.size()) {
active_ids_[i] = big_core_ids_[i]; active_ids_[i] = big_core_ids_[i];
} else { } else {
......
...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, ...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *alias, std::string *alias,
Place *place) { Place *place) {
auto parts = Split(kernel_type, "/"); auto parts = Split(kernel_type, "/");
CHECK_EQ(parts.size(), 5); CHECK_EQ(parts.size(), 5u);
*op_type = parts[0]; *op_type = parts[0];
*alias = parts[1]; *alias = parts[1];
......
...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + i * w; auto ptr_row = conv_weight_d + i * w;
for (unsigned int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i]; ptr_row[j] *= alpha_data[i];
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels for (int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels for (int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i]; conv_weight_d[i * w + j] *= alpha_data[i];
} }
} }
......
...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto channel_scale_tensor = auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>(); scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>(); auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) { for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range); weight_scale.push_back(channel_scale_data[i] / range);
} }
......
...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)}) .BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)}); .ExcludeTargets({TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph op's type info // get subgraph op's type info
size_t kernel_size = inst_node->AsStmt().kernels().size(); size_t kernel_size = inst_node->AsStmt().kernels().size();
CHECK_GT(kernel_size, 0); CHECK_GT(kernel_size, 0u);
VLOG(4) << "subgraph kernel size: " << kernel_size; VLOG(4) << "subgraph kernel size: " << kernel_size;
for (size_t i = 0; i < kernel_size; ++i) { for (size_t i = 0; i < kernel_size; ++i) {
...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { ...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc = auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get()) static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock(); ->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx); auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc); CHECK(op_desc);
if (op_desc->Type() == "conv2d") { if (op_desc->Type() == "conv2d") {
......
...@@ -59,6 +59,9 @@ class PassRegistry { ...@@ -59,6 +59,9 @@ class PassRegistry {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// some platform-independent defintion
#include "lite/utils/macros.h"
#define REGISTER_MIR_PASS(name__, class__) \ #define REGISTER_MIR_PASS(name__, class__) \
paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \
new class__); \ new class__); \
...@@ -66,4 +69,4 @@ class PassRegistry { ...@@ -66,4 +69,4 @@ class PassRegistry {
return mir_pass_registry##name__.Touch(); \ return mir_pass_registry##name__.Touch(); \
} \ } \
static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \
__attribute__((unused)) = mir_pass_registry##name__ UNUSED = mir_pass_registry##name__
...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass) paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU), TARGET(kRKNPU)}); .BindTargets({TARGET(kAPU), TARGET(kRKNPU)});
...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { ...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() {
"turquoise4", "snow3", "sienna4", "salmon2", "turquoise4", "snow3", "sienna4", "salmon2",
}; };
std::unordered_map<Node *, int> subgraph_indices; std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) { for (size_t i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) { for (size_t j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i; subgraph_indices[subgraphs_[i][j]] = i;
} }
} }
...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, ...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
std::vector<std::vector<Node *>> subgraphs = std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)(); SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)(); SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size();
subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
} }
......
...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc( ...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc(
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<int64_t>& wshape) { const std::vector<int64_t>& wshape) {
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1u);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2u);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + paddle::lite::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { ...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) {
}; };
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
ASSERT_EQ(graph->nodes().size(), 9); ASSERT_EQ(graph->nodes().size(), 9u);
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) {
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
} }
} // namespace lite } // namespace lite
......
...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
supported_lists.insert(#op_type); \
LOG(INFO) << #op_type
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
...@@ -103,6 +119,8 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -103,6 +119,8 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
......
...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { ...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class XPUSubgraphPass : public ProgramPass { class XPUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { ...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
std::vector<std::string> shape_strings = Split(text, ":"); std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size()); shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) { for (size_t i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ","); std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) { for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str())); shapes[i].push_back(atoi(shape_num.c_str()));
...@@ -66,7 +66,7 @@ void FillInputTensors( ...@@ -66,7 +66,7 @@ void FillInputTensors(
for (int j = 0; j < input_tensor_size; j++) { \ for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[j] = static_cast<type>(value); \ input_tensor_data[j] = static_cast<type>(value); \
} }
for (int i = 0; i < input_tensor_shape.size(); i++) { for (size_t i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i); auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]); input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_size = ShapeProduction(input_tensor->shape()); auto input_tensor_size = ShapeProduction(input_tensor->shape());
...@@ -95,7 +95,7 @@ void CheckOutputTensors( ...@@ -95,7 +95,7 @@ void CheckOutputTensors(
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \
EXPECT_LT(rel_diff, 0.1); \ EXPECT_LT(rel_diff, 0.1); \
} }
for (int i = 0; i < output_tensor_type.size(); i++) { for (size_t i = 0; i < output_tensor_type.size(); i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i); auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
......
...@@ -25,23 +25,23 @@ namespace lite { ...@@ -25,23 +25,23 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
iter++) { iter++) {
// combined dims value into new_hash value. // combined dims value into new_hash value.
auto &element_dims = (*iter)->dims(); auto &element_dims = (*iter)->dims();
for (int i = 0; i < element_dims.size(); i++) { for (size_t i = 0; i < element_dims.size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(element_dims[i])); lite::hash_combine(new_hash, static_cast<int>(element_dims[i]));
} }
...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() {
auto &emement_lods = (*iter)->lod(); auto &emement_lods = (*iter)->lod();
for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end();
lod_iter++) { lod_iter++) {
for (int i = 0; i < lod_iter->size(); i++) { for (size_t i = 0; i < lod_iter->size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i))); lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i)));
} }
...@@ -59,8 +59,8 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,8 +59,8 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
} }
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { last_output_shapes.clear();
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_lods.clear();
last_output_lods[i] = current_outputs->at(i)->lod(); for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kNPU): { case TARGET(kNPU): {
CREATE_KERNEL(kNPU); CREATE_KERNEL(kNPU);
} break; } break;
case TARGET(kAPU): {
CREATE_KERNEL(kAPU);
} break;
case TARGET(kXPU): { case TARGET(kXPU): {
CREATE_KERNEL(kXPU); CREATE_KERNEL(kXPU);
} break; } break;
...@@ -220,6 +223,7 @@ KernelRegistry::KernelRegistry() ...@@ -220,6 +223,7 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kNHWC);
INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kNPU, kAny, kAny);
INIT_FOR(kAPU, kInt8, kNCHW);
INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW);
INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW);
INIT_FOR(kXPU, kAny, kNCHW); INIT_FOR(kXPU, kAny, kNCHW);
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -231,6 +234,9 @@ class KernelRegistry final { ...@@ -231,6 +234,9 @@ class KernelRegistry final {
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kAPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU), KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -445,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> { ...@@ -445,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> {
#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \
LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__)
#define REGISTER_LITE_KERNEL( \ #define REGISTER_LITE_KERNEL( \
op_type__, target__, precision__, layout__, KernelClass, alias__) \ op_type__, target__, precision__, layout__, KernelClass, alias__) \
static paddle::lite::KernelRegistor<TARGET(target__), \ static paddle::lite::KernelRegistor<TARGET(target__), \
PRECISION(precision__), \ PRECISION(precision__), \
DATALAYOUT(layout__), \ DATALAYOUT(layout__), \
KernelClass> \ KernelClass> \
LITE_KERNEL_REGISTER_INSTANCE( \ LITE_KERNEL_REGISTER_INSTANCE( \
op_type__, target__, precision__, layout__, alias__)(#op_type__, \ op_type__, target__, precision__, layout__, alias__)(#op_type__, \
#alias__); \ #alias__); \
static KernelClass LITE_KERNEL_INSTANCE( \ static KernelClass LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__); \ op_type__, target__, precision__, layout__, alias__); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \ int touch_##op_type__##target__##precision__##layout__##alias__() { \
OpKernelInfoCollector::Global().AddKernel2path( \ OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \ __FILE__); \
LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
.Touch(); \ .Touch(); \
return 0; \ return 0; \
} \ } \
static bool LITE_KERNEL_PARAM_INSTANCE( \ static bool LITE_KERNEL_PARAM_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) UNUSED = \
__attribute__((unused)) = \ paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \
paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \ PRECISION(precision__), \
PRECISION(precision__), \ DATALAYOUT(layout__)>( \
DATALAYOUT(layout__)>( \ #op_type__ "/" #alias__)
#op_type__ "/" #alias__)
#define LITE_KERNEL_INSTANCE( \ #define LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) \
......
...@@ -101,6 +101,7 @@ class Optimizer { ...@@ -101,6 +101,7 @@ class Optimizer {
"npu_subgraph_pass", "npu_subgraph_pass",
"xpu_subgraph_pass", "xpu_subgraph_pass",
"bm_subgraph_pass", "bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass", "rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph "static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's "variable_place_inference_pass", // inference arg/var's
......
...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
std::unordered_map<std::string, cpp::VarDesc> origin_var_maps; std::unordered_map<std::string, cpp::VarDesc> origin_var_maps;
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0); auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
auto var_size = main_block.VarsSize(); auto var_size = main_block.VarsSize();
for (int i = 0; i < var_size; i++) { for (size_t i = 0; i < var_size; i++) {
auto v = main_block.GetVar<cpp::VarDesc>(i); auto v = main_block.GetVar<cpp::VarDesc>(i);
auto name = v->Name(); auto name = v->Name();
origin_var_maps.emplace(name, *v); origin_var_maps.emplace(name, *v);
......
...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { ...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer, void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer,
size_t memory_size) { size_t memory_size) {
CHECK_EQ(offset_, 0) CHECK_EQ(offset_, 0u)
<< "Only the offset is supported to zero when the Buffer is reset."; << "Only the offset is supported to zero when the Buffer is reset.";
if (buffer_) { if (buffer_) {
CHECK_LE(memory_size_, buffer->space()) CHECK_LE(memory_size_, buffer->space())
......
...@@ -30,7 +30,7 @@ namespace core { ...@@ -30,7 +30,7 @@ namespace core {
// TODO(Superjomn) unify all the type representation across the lite framework. // TODO(Superjomn) unify all the type representation across the lite framework.
enum class Type { enum class Type {
UNK = -1, UNK = -1,
// primary types // primary typesINT32,
INT32, INT32,
INT64, INT64,
FLOAT32, FLOAT32,
...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>(); ...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>();
template <> template <>
Type StdTypeToRepr<bool>(); Type StdTypeToRepr<bool>();
template <> template <>
Type StdTypeToRepr<double>();
template <>
Type StdTypeToRepr<std::vector<char>>(); Type StdTypeToRepr<std::vector<char>>();
template <> template <>
Type StdTypeToRepr<std::string>(); Type StdTypeToRepr<std::string>();
......
...@@ -18,6 +18,11 @@ ...@@ -18,6 +18,11 @@
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT #include "paddle_use_passes.h" // NOLINT
#if defined(_WIN32)
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#endif
using namespace paddle::lite_api; // NOLINT using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path."); DEFINE_string(model_dir, "", "Model dir path.");
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "lite/fluid/data_type.h" #include "lite/fluid/data_type.h"
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
......
...@@ -11,5 +11,6 @@ add_subdirectory(fpga) ...@@ -11,5 +11,6 @@ add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(rknpu) add_subdirectory(rknpu)
add_subdirectory(bridges)
add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges})
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor)
lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu)
set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu)
lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_apu
subgraph_bridge_conv_op_apu
subgraph_bridge_elementwise_ops_apu
subgraph_bridge_act_op_apu
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册