Commit 09eea7a2 authored by hong19860320, committed by GitHub

[APU] Add MTK APU backend (#3407)

Parent b496e7f4
......@@ -89,6 +89,7 @@ lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
......@@ -173,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/postproject)
include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK
include(device/apu) # check and prepare APU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed.
......
......@@ -143,6 +143,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_APU)
add_definitions("-DLITE_WITH_APU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_APU)
return()
endif()
if(NOT DEFINED APU_DDK_ROOT)
set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
if(NOT APU_DDK_ROOT)
message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
endif()
endif()
message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")
find_path(APU_DDK_INC NAMES NeuronAdapter.h
PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH)
if(NOT APU_DDK_INC)
message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
include_directories("${APU_DDK_ROOT}/include")
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(APU_SUB_LIB_PATH "lib64")
endif()
find_library(APU_NEURON_FILE NAMES neuron
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
if(NOT APU_NEURON_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
add_library(apu_neuron SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()
if(NOT APU_NEURON_ADAPTER_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "${apu_runtime_libs}")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_APU)
foreach(var ${lite_deps_APU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
......@@ -137,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -153,6 +159,7 @@ function(lite_cc_library TARGET)
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -186,7 +193,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -199,6 +206,7 @@ function(lite_cc_binary TARGET)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
......@@ -238,7 +246,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -259,6 +267,7 @@ function(lite_cc_test TARGET)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
......@@ -292,6 +301,7 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
......@@ -311,12 +321,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -374,6 +384,15 @@ function(add_kernel TARGET device level)
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "APU")
if (NOT LITE_WITH_APU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
......@@ -457,6 +476,7 @@ function(add_kernel TARGET device level)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
......@@ -479,7 +499,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -512,6 +532,7 @@ function(add_operator TARGET level)
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
......
......@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
......@@ -71,6 +72,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU)
if (LITE_WITH_APU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu")
endif(LITE_WITH_APU)
if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA)
......
......@@ -37,11 +37,12 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(NOT APPLE AND NOT WIN32)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
......@@ -81,7 +82,9 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
......@@ -106,6 +109,7 @@ message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get APU kernels ${apu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
......@@ -125,6 +129,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
......@@ -146,6 +151,7 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -166,6 +172,7 @@ if(WITH_TESTING)
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -223,7 +230,7 @@ if(WITH_TESTING)
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels})
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
DEPS ${lite_model_test_DEPS}
......@@ -295,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
......@@ -359,6 +367,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
......@@ -379,6 +388,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
......@@ -393,6 +403,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
......@@ -407,6 +418,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
......@@ -422,6 +434,7 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
......@@ -432,6 +445,7 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
......@@ -448,6 +462,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
......
......@@ -108,6 +108,9 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "apu") {
valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -191,6 +194,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kOpenCL",
"kFPGA",
"kNPU",
"kAPU",
"kXPU",
"kRKNPU",
"kAny",
......@@ -257,16 +261,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
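After this change the opt tool accepts apu in --valid_targets. The equivalent selection through the full C++ API would look roughly like the hedged sketch below (not part of this commit; it assumes the standard CxxConfig/Place API from paddle_api.h and paddle_place.h).
#include <string>
#include <vector>
#include "paddle_api.h"    // NOLINT
#include "paddle_place.h"  // NOLINT
// Hedged sketch: build a predictor whose valid places include the MTK APU
// (kAPU, kInt8, kNCHW), falling back to ARM for unsupported ops.
void ConfigureForApu(const std::string& model_dir) {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)},
       paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);
  (void)predictor;
}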
......@@ -73,7 +73,8 @@ const std::string& TargetToStr(TargetType target) {
"xpu",
"bm",
"mlu",
"rknpu"};
"rknpu",
"apu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -113,9 +114,10 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
"kMLU",
"kBM",
"kRKNPU"};
"kMLU",
"kRKNPU",
"kAPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -158,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kAPU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -49,14 +49,15 @@ enum class TargetType : int {
kCUDA = 3,
kARM = 4,
kOpenCL = 5,
kAny = 6, // any target
kFPGA = 7,
kNPU = 8,
kXPU = 9,
kBM = 10,
kMLU = 11,
kRKNPU = 12,
kAny = 6, // any target
NUM = 13, // number of fields.
kAPU = 13,
NUM = 14, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
......@@ -183,6 +183,7 @@ void BindLitePlace(py::module *m) {
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("APU", TargetType::kAPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -8,4 +8,5 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(device_apu SRCS device.cc)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/apu/device.h"
#include <dlfcn.h>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace apu {
inline void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
typedef int (*NeuronCompilation_create)(NeuronModel * model,
NeuronCompilation * *compilation);
typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS
int neuron_errCode = 0;
NeuronCompilation* compilation = NULL;
VLOG(3) << "[APU] Compile model";
neuron_errCode = (*neuron_compilation_create)(model, &compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
return nullptr;
}
neuron_errCode = (*neuron_compilation_finish)(compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
return nullptr;
}
VLOG(3) << "[APU] Build done";
return compilation;
}
} // namespace apu
} // namespace lite
} // namespace paddle
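A minimal usage sketch (not part of this commit) of how the subgraph kernel is expected to drive Device::Build(): open the Neuron runtime with dlopen, resolve the model-building entry points with dlsym, finish the NeuronModel, then compile it. The library name libneuron_adapter.so is an assumption based on the cmake above.
#include <dlfcn.h>
#include "lite/backends/apu/device.h"
// Function-pointer types mirroring the typedefs in
// lite/kernels/apu/bridges/utility.cc.
typedef int (*NeuronModel_create_fn)(NeuronModel** model);
typedef int (*NeuronModel_finish_fn)(NeuronModel* model);
void BuildExample() {
  // Assumed runtime library name; resolved at runtime on the device.
  void* libHandle = dlopen("libneuron_adapter.so", RTLD_LAZY);
  if (libHandle == nullptr) return;
  auto create = reinterpret_cast<NeuronModel_create_fn>(
      dlsym(libHandle, "NeuronModel_create"));
  auto finish = reinterpret_cast<NeuronModel_finish_fn>(
      dlsym(libHandle, "NeuronModel_finish"));
  if (create == nullptr || finish == nullptr) return;
  NeuronModel* model = nullptr;
  (*create)(&model);
  // ... the APU bridge converters add operands and operations here ...
  (*finish)(model);
  NeuronCompilation* compilation =
      paddle::lite::apu::Device::Global().Build(libHandle, model);
  (void)compilation;  // handed to the Neuron execution API by the kernel
}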
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
namespace paddle {
namespace lite {
namespace apu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
NeuronCompilation* Build(void* libHandle, NeuronModel* model);
};
} // namespace apu
} // namespace lite
} // namespace paddle
......@@ -54,6 +54,7 @@ using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>;
using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>;
using APUContext = Context<TargetType::kAPU>;
using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
......@@ -87,6 +88,21 @@ class Context<TargetType::kNPU> {
};
#endif
#ifdef LITE_WITH_APU
template <>
class Context<TargetType::kAPU> {
public:
Context() {}
explicit Context(const APUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(APUContext* ctx) {}
APUContext& operator=(const APUContext& ctx) {}
std::string name() const { return "APUContext"; }
};
#endif
#ifdef LITE_WITH_BM
template <>
class Context<TargetType::kBM> {
......@@ -408,6 +424,12 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
&ctx->As<APUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
......@@ -483,6 +505,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
......
......@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)});
.ExcludeTargets({TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
......@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU), TARGET(kRKNPU)});
.BindTargets({TARGET(kAPU), TARGET(kRKNPU)});
......@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
supported_lists.insert(#op_type); \
LOG(INFO) << #op_type
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists;
......@@ -103,6 +119,8 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
......
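For reference, a self-contained sketch (not from this commit) of what the USE_SUBGRAPH_BRIDGE expansion in APUSubgraphPass::Apply above produces: every bridge registered in lite/kernels/apu/bridges/paddle_use_bridges.h contributes one entry to the supported-op set consulted by the teller.
#include <string>
#include <unordered_set>
std::unordered_set<std::string> SupportedApuOps() {
  std::unordered_set<std::string> supported_lists;
// Same trick as in APUSubgraphPass::Apply, shown here for two bridges only.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
  USE_SUBGRAPH_BRIDGE(fc, kAPU)
  USE_SUBGRAPH_BRIDGE(softmax, kAPU)
#undef USE_SUBGRAPH_BRIDGE
  return supported_lists;  // {"fc", "softmax"}
}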
......@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class XPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
......@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kNPU): {
CREATE_KERNEL(kNPU);
} break;
case TARGET(kAPU): {
CREATE_KERNEL(kAPU);
} break;
case TARGET(kXPU): {
CREATE_KERNEL(kXPU);
} break;
......@@ -220,6 +223,7 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kNPU, kAny, kNHWC);
INIT_FOR(kNPU, kAny, kAny);
INIT_FOR(kAPU, kInt8, kNCHW);
INIT_FOR(kXPU, kFloat, kNCHW);
INIT_FOR(kXPU, kInt8, kNCHW);
INIT_FOR(kXPU, kAny, kNCHW);
......
......@@ -231,6 +231,9 @@ class KernelRegistry final {
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kAPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
......
......@@ -86,6 +86,7 @@ class Optimizer {
"npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
......
......@@ -11,5 +11,6 @@ add_subdirectory(fpga)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm)
add_subdirectory(rknpu)
add_subdirectory(bridges)
add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges})
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor)
lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu)
set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu)
lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_apu
subgraph_bridge_conv_op_apu
subgraph_bridge_elementwise_ops_apu
subgraph_bridge_act_op_apu
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter);
This diff is collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
auto axis = op_info->GetAttr<int>("axis");
// Act node
if (op_type == "fusion_elementwise_add_activation" ||
op_type == "fusion_elementwise_sub_activation" ||
op_type == "fusion_elementwise_mul_activation" ||
op_type == "fusion_elementwise_div_activation") {
auto act_type = op_info->GetAttr<std::string>("act_type");
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
auto libHandle = graph->libHandle();
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
LOAD_FUNCTIONS(
libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue)
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation)
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w = scope->FindMutableTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[APU] input dims: " << input_dims << " w dims: " << w_dims
<< " out_dims: " << out_dims << " m: " << m << " k: " << k
<< " n: " << n;
float input_scale = 1.0f;
float out_scale = 1.0f;
std::vector<float> w_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
if (op_info->HasAttr("input_scale"))
input_scale = op_info->GetAttr<float>("input_scale");
if (op_info->HasAttr("weight_scale"))
w_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
if (op_info->HasAttr("output_scale"))
out_scale = op_info->GetAttr<float>("output_scale");
} else {
return FAILED;
}
} else {
return FAILED;
}
// Add input tensor type
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = input_scale;
inType.zeroPoint = 128;
inType.dimensionCount = input_dims.size();
std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3],
(uint32_t)input_dims[1]};
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> in_node = nullptr;
if (graph->Has(input_name)) {
// input operand already exist
in_node = graph->Get(input_name);
VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index();
} else {
// add input operand
(*neuron_model_addOperand)(model, &inType); // 0: input
in_node = graph->Add(input_name, dims_in);
}
VLOG(3) << "input_scale: " << input_scale
<< ", inType: " << inType.dimensions[0] << " : "
<< inType.dimensions[1] << " : " << inType.dimensions[2] << " : "
<< inType.dimensions[3];
NeuronOperandType wType;
wType.type = NEURON_TENSOR_QUANT8_ASYMM;
wType.scale = w_scale[0];
wType.zeroPoint = 128;
wType.dimensionCount = w_dims.size();
std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]};
wType.dimensions = &dims_w[0];
(*neuron_model_addOperand)(model, &wType); // 1: weight
std::shared_ptr<Node> w_node = nullptr;
w_node = graph->Add(w_name, dims_w);
VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0]
<< ", wType dimensions: " << wType.dimensions[0] << " : "
<< wType.dimensions[1] << ", memory size: " << w->memory_size();
// Add bias type
NeuronOperandType biasType;
biasType.type = NEURON_TENSOR_INT32;
biasType.zeroPoint = 0;
biasType.scale = input_scale * w_scale[0];
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
biasType.dimensionCount = bias_dims.size();
std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]};
biasType.dimensions = &dims_bias[0];
(*neuron_model_addOperand)(model, &biasType); // 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims
<< ", bias scale: " << biasType.scale
<< " ,memory size: " << bias->memory_size();
} else {
biasType.dimensionCount = 1;
std::vector<uint32_t> dims_bias = {(uint32_t)n};
biasType.dimensions = &dims_bias[0];
(*neuron_model_addOperand)(model, &biasType); // 2: bias
bias_node = graph->Add(w_name + "_default_bias", dims_bias);
}
// Add fuse type
NeuronOperandType fuseType;
fuseType.type = NEURON_INT32;
fuseType.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0};
(*neuron_model_addOperand)(model, &fuseType); // 3: fuse
std::shared_ptr<Node> fuse_node = nullptr;
fuse_node = graph->Add(w_name + "_fuse", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = 2;
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], out_dims[1]};
outType.dimensions = &dims_out[0];
VLOG(3) << "out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << " : "
<< outType.dimensions[1];
(*neuron_model_addOperand)(model, &outType); // output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_out);
int8_t* w_data = w->mutable_data<int8_t>();
Tensor transpose_filter;
// Original dimension
transpose_filter.Resize({(uint32_t)w_dims[1], (uint32_t)w_dims[0]});
transpose_filter.mutable_data<uint8_t>();
transposeAsym(w->data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
{1, 1, (uint32_t)w_dims[0], (uint32_t)w_dims[1]},
{0, 1, 3, 2});
memcpy(w->mutable_data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
w->memory_size());
int neuron_errCode = (*neuron_model_setOperandValue)(
model, w_node->index(), w->raw_data(), w->memory_size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set W operand value fail:" << neuron_errCode
<< ",index: " << w_node->index();
return FAILED;
}
// Add bias if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
int32_t* int32_bias_data =
reinterpret_cast<int32_t*>(bias->mutable_data<float>());
float2int32(bias->data<float>(), input_scale, w_scale, int32_bias_data);
VLOG(3) << int32_bias_data[0] << ":" << int32_bias_data[1] << ":"
<< int32_bias_data[2] << ":" << int32_bias_data[3];
neuron_errCode =
(*neuron_model_setOperandValue)(model,
bias_node->index(),
bias->raw_data(),
bias->memory_size()); // 2: bias
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, out_dims[1]});
int32_bias->mutable_data<int32_t>();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
VLOG(3) << "default: " << int32_bias->memory_size();
neuron_errCode =
(*neuron_model_setOperandValue)(model,
bias_node->index(),
int32_bias->raw_data(),
int32_bias->memory_size()); // 2: bias
bias_node->set_data(int32_bias);
}
// Add fuse value
int32_t fuse_val[1] = {0};
(*neuron_model_setOperandValue)(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse
std::vector<uint32_t> addInIndex = {in_node->index(),
w_node->index(),
bias_node->index(),
fuse_node->index()};
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_FULLY_CONNECTED,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc, kAPU, paddle::lite::subgraph::apu::FCConverter);
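A note on the bias scale used above: with asymmetric 8-bit quantization every input-times-weight product carries a factor of input_scale * w_scale, so the int32 bias must be stored on that same scale, which is why biasType.scale = input_scale * w_scale[0]. The sketch below illustrates the conversion the float2int32() helper is expected to perform; it is an assumption for clarity, not the helper's actual implementation.
#include <cmath>
#include <cstdint>
#include <vector>
// Requantize an fp32 bias to int32 on the scale input_scale * weight_scale,
// matching the NEURON_TENSOR_INT32 bias operand added by FCConverter above.
std::vector<int32_t> QuantizeBias(const std::vector<float>& bias_fp32,
                                  float input_scale,
                                  float weight_scale) {
  const float bias_scale = input_scale * weight_scale;
  std::vector<int32_t> bias_int32(bias_fp32.size());
  for (size_t i = 0; i < bias_fp32.size(); ++i) {
    bias_int32[i] = static_cast<int32_t>(std::round(bias_fp32[i] / bias_scale));
  }
  return bias_int32;
}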
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include <utility>
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
LOG(FATAL) << "[APU] Node" << name << " is redefined.";
return -1;
} else {
VLOG(3) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
operandIdx_ += 1;
it->second.push_back(node);
return it->second.size();
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "NeuronAdapter.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
// Graph and Node are defined to collect all of the converted Neuron IR nodes
class Node {
public:
Node(int32_t operand_idx, std::vector<uint32_t> shape)
: idx_(operand_idx), shape_(shape) {}
void set_shape(std::vector<uint32_t> shape) { shape_ = shape; }
uint32_t index() { return idx_; }
std::vector<uint32_t> shape() const { return shape_; }
void set_data(std::shared_ptr<Tensor> data) { data_ = data; }
private:
int32_t idx_;
std::vector<uint32_t> shape_;
std::shared_ptr<Tensor> data_{nullptr};
};
class Graph {
public:
int Add(const std::string& name, std::shared_ptr<Node> node);
// Variable, const or data node
std::shared_ptr<Node> Add(const std::string& name,
std::vector<uint32_t> shape) {
CHECK(shape.size()) << name << " : " << shape.size();
auto node = std::make_shared<Node>(operandIdx_, shape);
auto idx = Add(name, node);
CHECK_GE(idx, 1);
return node;
}
void set_model(NeuronModel* model) { model_ = model; }
NeuronModel* model() { return model_; }
void set_libHandle(void* libHandle) { libHandle_ = libHandle; }
void* libHandle() { return libHandle_; }
void set_input_names(const std::vector<std::string> input_names) {
input_names_ = input_names;
}
bool IsInput(const std::string& name) {
for (int i = 0; i < input_names_.size(); i++) {
if (input_names_[i] == name) return true;
}
return false;
}
bool IsOutput(const std::string& name) {
for (int i = 0; i < output_names_.size(); i++) {
if (output_names_[i] == name) return true;
}
return false;
}
void set_output_names(const std::vector<std::string> output_names) {
output_names_ = output_names;
}
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[APU] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
private:
void* libHandle_;
NeuronModel* model_;
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
int32_t operandIdx_ = 0;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
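A small usage sketch (not from this commit) of the index bookkeeping above: every Graph::Add() consumes the next operand index, so the order of Add() calls must mirror the order of the NeuronModel_addOperand() calls made by the converters.
#include "lite/kernels/apu/bridges/graph.h"
void GraphIndexExample(paddle::lite::subgraph::apu::Graph* graph) {
  // Shapes are illustrative; indices follow insertion order starting at 0.
  auto x_node = graph->Add("x", {1, 224, 224, 3});  // operand index 0
  auto w_node = graph->Add("w", {32, 3, 3, 3});     // operand index 1
  CHECK_EQ(x_node->index(), 0u);
  CHECK_EQ(w_node->index(), 1u);
  if (graph->Has("x")) {
    auto same = graph->Get("x");  // returns the most recently added "x" node
    (void)same;
  }
}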
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d, kAPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kAPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "] ";
auto libHandle = graph->libHandle();
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
LOAD_FUNCTIONS(
libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue)
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation)
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
// pool mode
if ((pooling_type == "max") || (pooling_type == "avg")) {
} else {
LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
return FAILED;
}
// pad mode
int pad_mode = 0;
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
if (padding_algorithm == "SAME") {
pad_mode = 6;
} else if (padding_algorithm == "VALID") {
pad_mode = 5;
}
// paddings and strides
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[APU] Paddings size should be the same or twice as the inputs size.";
bool adaptive = false;
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto strides = op_info->GetAttr<std::vector<int>>("strides");
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
x->dims(),
strides,
ksize);
// Add x tensor type
float x_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
if (op_info->HasAttr("input_scale"))
x_scale = op_info->GetAttr<float>("input_scale");
if (op_info->HasAttr("output_scale"))
out_scale = op_info->GetAttr<float>("output_scale");
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = x_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
LOG(INFO) << "Graph has " << x_name;
// input operand already exist
x_node = graph->Get(x_name);
} else {
// add input operand
(*neuron_model_addOperand)(model, &xType); // 0: x
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":"
<< xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3];
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0};
std::shared_ptr<Node> paddingL_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 1: padding left
paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 2: padding right
paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 3: padding top
paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 4: padding bottom
paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 5: stride width
strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 6: stride height
strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
std::shared_ptr<Node> filterW_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 7: filter width
filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
std::shared_ptr<Node> filterH_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 8: filter height
filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr;
(*neuron_model_addOperand)(model, &int32Type); // 9: fuse
fuse_node = graph->Add(x_name + "_fuse", dims_int32);
// Add out type
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
out_node = graph->Get(out_name);
} else {
(*neuron_model_addOperand)(model, &outType); // out
out_node = graph->Add(out_name, dims_out);
}
VLOG(3) << "output_scale: " << x_scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Add padding value
int32_t padding_val[1];
padding_val[0] = paddings[2];
(*neuron_model_setOperandValue)(
model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[3];
(*neuron_model_setOperandValue)(
model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[0];
(*neuron_model_setOperandValue)(
model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[1];
(*neuron_model_setOperandValue)(
model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1);
// Add Stride
int32_t stride_val[1];
stride_val[0] = strides[1]; // width
(*neuron_model_setOperandValue)(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // height
(*neuron_model_setOperandValue)(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
// Add filter
int32_t filter_val[1];
filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width
(*neuron_model_setOperandValue)(
model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height
(*neuron_model_setOperandValue)(
model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
// Add fuse
int32_t fuse_val[1] = {0};
(*neuron_model_setOperandValue)(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {x_node->index(),
paddingL_node->index(),
paddingR_node->index(),
paddingT_node->index(),
paddingB_node->index(),
strideW_node->index(),
strideH_node->index(),
filterW_node->index(),
filterH_node->index(),
fuse_node->index()};
std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode;
if (pooling_type == "max") {
neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_MAX_POOL_2D,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
} else {
neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_AVERAGE_POOL_2D,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(pool2d,
kAPU,
paddle::lite::subgraph::apu::PoolConverter);
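Both the FC and pool converters above hand shapes to the Neuron adapter in NHWC order while Paddle tensors are NCHW; the mapping they apply inline is equivalent to the small helper below (a sketch for clarity, not a helper that exists in this commit).
#include <cstdint>
#include <vector>
#include "lite/core/tensor.h"
// Convert a Paddle NCHW shape to the NHWC layout expected by the Neuron
// adapter: {N, C, H, W} -> {N, H, W, C}.
std::vector<uint32_t> ToNHWC(const paddle::lite::DDim& nchw_dims) {
  return {static_cast<uint32_t>(nchw_dims[0]),
          static_cast<uint32_t>(nchw_dims[2]),
          static_cast<uint32_t>(nchw_dims[3]),
          static_cast<uint32_t>(nchw_dims[1])};
}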
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
auto libHandle = graph->libHandle();
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
LOAD_FUNCTIONS(
libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue)
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation)
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
CHECK_GE(x_dims.size(), 2UL);
auto x_rank = x_dims.size();
auto out_name = op_info->Output("Out").front();
// Check output shape
auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_rank;
}
float input_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
if (op_info->HasAttr("input_scale"))
input_scale = op_info->GetAttr<float>("input_scale");
if (op_info->HasAttr("output_scale"))
out_scale = op_info->GetAttr<float>("output_scale");
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
// Check output scale
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = input_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x;
for (int i = 0; i < x_dims.size(); i++) dims_x.push_back(x_dims[i]);
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
// input operand already exist
x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else {
// add input operand
(*neuron_model_addOperand)(model, &xType); // 0: input
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "input_scale size: " << input_scale
<< " ,x_dims size: " << x_dims.size() << " ,x_dims: " << x_dims;
// Add beta operand
std::vector<uint32_t> dims_int32 = {0};
NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0;
(*neuron_model_addOperand)(model, &betaType); // 1: beta
std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32);
// Add axis operand
NeuronOperandType axisType;
axisType.type = NEURON_INT32;
axisType.dimensionCount = 0;
(*neuron_model_addOperand)(model, &axisType); // 2: axis
std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32);
// Add out operand
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale / 127;
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
(*neuron_model_addOperand)(model, &outType); // 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "output_scale: " << out_scale;
float beta_val[] = {1.0f};
(*neuron_model_setOperandValue)(
model, beta_node->index(), beta_val, sizeof(float) * 1);
int32_t axis_val[1];
axis_val[0] = axis;
(*neuron_model_setOperandValue)(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
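  // NEURON_SOFTMAX takes three inputs: the quantized tensor, a scalar beta
  // (fixed to 1.0f to match Paddle's softmax) and the scalar axis set above.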
std::vector<uint32_t> addInIndex = {
x_node->index(), beta_node->index(), axis_node->index()};
std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_SOFTMAX,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(softmax,
kAPU,
paddle::lite::subgraph::apu::SoftmaxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/utility.h"
#include <utility>
#include "lite/kernels/apu/bridges/graph.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
// typedef to the build functions pointer signatures
typedef int (*Neuron_getVersion)(uint32_t* version);
typedef int (*NeuronModel_create)(NeuronModel** model);
typedef void (*NeuronModel_free)(NeuronModel* model);
typedef int (*NeuronModel_finish)(NeuronModel* model);
typedef int (*NeuronModel_addOperand)(NeuronModel* model,
const NeuronOperandType* type);
typedef int (*NeuronModel_setOperandValue)(NeuronModel* model,
int32_t index,
const void* buffer,
size_t length);
typedef int (*NeuronModel_addOperation)(NeuronModel* model,
NeuronOperationType type,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs);
typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs);
typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)(
NeuronModel* model,
int32_t index,
const NeuronSymmPerChannelQuantParams* channelQuant);
typedef int (*NeuronExecution_create)(NeuronCompilation* compilation,
NeuronExecution** execution);
typedef void (*NeuronExecution_free)(NeuronExecution* execution);
typedef int (*NeuronExecution_setInput)(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
const void* buffer,
size_t length);
typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
void* buffer,
size_t length);
typedef int (*NeuronExecution_compute)(NeuronExecution* execution);
void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
void insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint) {
int neuron_errCode;
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto libHandle = graph->libHandle();
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
LOAD_FUNCTIONS(
libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue)
LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation)
// Add input
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale;
inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(3) << "Has " << input_name;
input_node = graph->Get(input_name);
} else {
neuron_errCode = (*neuron_model_addOperand)(model, &inType); // input
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!";
return;
}
VLOG(3) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape);
}
// Add perm
NeuronOperandType permsType;
permsType.type = NEURON_TENSOR_INT32;
permsType.dimensionCount = 1;
uint32_t dims_perms[1] = {4};
permsType.dimensions = dims_perms;
neuron_errCode = (*neuron_model_addOperand)(model, &permsType); // perm
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!";
return;
}
std::shared_ptr<Node> perms_node = nullptr;
perms_node = graph->Add(input_name + "_perms", {4});
VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
<< axis[3];
// &axis[0], sizeof(int32_t) * axis.size());
neuron_errCode = (*neuron_model_setOperandValue)(
model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!";
return;
}
// Add output
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = scale;
outType.zeroPoint = zeroPoint;
outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0];
(*neuron_model_addOperand)(model, &outType); // output
std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape);
std::vector<uint32_t> addInIndex = {input_node->index(), // 0: input
perms_node->index()}; // 1: perm
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = (*neuron_model_addOperation)(model,
NEURON_TRANSPOSE,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!";
}
}
void transpose(const int8_t* input_data,
uint8_t* output_data,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> axis) {
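  // Plain 4-D transpose: every input element at (dim[0], ..., dim[3]) is
  // written to the output position (dim[axis[0]], ..., dim[axis[3]]), using
  // the permuted shape entries as the output strides.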
int old_index = -1;
int new_index = -1;
int dim[4] = {0};
std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] +
dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3];
new_index =
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
output_data[new_index] = input_data[old_index];
}
}
}
}
}
void transposeAsym(const int8_t* input_data,
uint8_t* output_data,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> axis) {
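  // Same element-wise permutation as transpose() above, but each int8 value is
  // additionally shifted by +128 so the result matches the asymmetric uint8
  // (per-layer quantized) layout expected by the Neuron runtime.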
int old_index = -1;
int new_index = -1;
int dim[4] = {0};
std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] +
dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3];
new_index =
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
output_data[new_index] = input_data[old_index] + 128; // per layer
}
}
}
}
}
void float2int32(const float* bias_data,
float input_scale,
std::vector<float> weight_scale,
int32_t* int32_bias_data) {
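  // Requantizes a float bias to int32 using the usual conv-style bias scale
  // bias_scale[i] = input_scale * weight_scale[i], i.e.
  // int32_bias[i] = float_bias[i] / (input_scale * weight_scale[i]).
  // No explicit rounding is applied; the implicit float->int conversion
  // truncates the result.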
for (int i = 0; i < weight_scale.size(); i++) {
int32_bias_data[i] = bias_data[i] / (input_scale * weight_scale[i]);
}
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <dlfcn.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h"
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
// typedef to the build functions pointer signatures
typedef int (*Neuron_getVersion)(uint32_t* version);
typedef int (*NeuronModel_create)(NeuronModel** model);
typedef void (*NeuronModel_free)(NeuronModel* model);
typedef int (*NeuronModel_finish)(NeuronModel* model);
typedef int (*NeuronModel_addOperand)(NeuronModel* model,
const NeuronOperandType* type);
typedef int (*NeuronModel_setOperandValue)(NeuronModel* model,
int32_t index,
const void* buffer,
size_t length);
typedef int (*NeuronModel_addOperation)(NeuronModel* model,
NeuronOperationType type,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs);
typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs);
typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)(
NeuronModel* model,
int32_t index,
const NeuronSymmPerChannelQuantParams* channelQuant);
typedef int (*NeuronExecution_create)(NeuronCompilation* compilation,
NeuronExecution** execution);
typedef void (*NeuronExecution_free)(NeuronExecution* execution);
typedef int (*NeuronExecution_setInput)(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
const void* buffer,
size_t length);
typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
void* buffer,
size_t length);
typedef int (*NeuronExecution_compute)(NeuronExecution* execution);
void* LoadFunc(void* libHandle, const char* name);
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
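// Example usage (a minimal sketch; `model` and `operand_type` stand for
// whatever NeuronModel* / NeuronOperandType objects are in scope):
//   LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand)
//   (*neuron_model_addOperand)(model, &operand_type);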
// Type/tensor converters for converting Paddle type/tensor to Neuron
// type/tensor
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
void insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint);
void transpose(const int8_t* input_data,
uint8_t* output_data,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> axis);
void transposeAsym(const int8_t* input_data,
uint8_t* output_data,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> axis);
void float2int32(const float* bias_data,
float input_scale,
std::vector<float> weight_scale,
int32_t* int32_bias_data);
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/subgraph_compute.h"
#include <dlfcn.h>
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/apu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace apu {
inline void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
int SubgraphEngine::BuildDeviceProgram() {
typedef int (*Neuron_getVersion)(uint32_t * version);
typedef int (*NeuronModel_create)(NeuronModel * *model);
typedef void (*NeuronModel_free)(NeuronModel * model);
typedef int (*NeuronModel_finish)(NeuronModel * model);
typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel * model,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs);
  // Open the shared library
libHandle_ = dlopen("libneuron_adapter.so", RTLD_LAZY);
if (libHandle_ == nullptr) {
LOG(WARNING) << "Failed to open libneuron_adapter.so. " << dlerror();
return subgraph::FAILED;
}
LOAD_FUNCTIONS(libHandle_, Neuron_getVersion, neuron_getVersion)
LOAD_FUNCTIONS(libHandle_, NeuronModel_create, neuron_model_create)
LOAD_FUNCTIONS(libHandle_, NeuronModel_finish, neuron_model_finish)
LOAD_FUNCTIONS(libHandle_,
NeuronModel_identifyInputsAndOutputs,
neuron_model_identifyInputsAndOutputs)
  uint32_t version;
(*neuron_getVersion)(&version);
VLOG(3) << "Neuron Adapter version: " << version;
int status = 0;
subgraph::apu::Graph graph;
int neuron_errCode = (*neuron_model_create)(&model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create model";
return subgraph::FAILED;
}
graph.set_libHandle(libHandle_);
graph.set_model(model_);
graph.set_input_names(input_names_);
graph.set_output_names(output_names_);
  // Convert all of the ops and their input vars and weights, and add them into
  // the APU NIR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kAPU))) {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |=
bridges.Select(op_type, TARGET(kAPU))(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Get input tensor
std::vector<uint32_t> ins;
origin_itensors_.resize(input_names_.size());
origin_idims_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":"
<< origin_idims_[i].production();
// Get input index
if (graph.Has(input_names_[i])) {
ins.push_back(graph.Get(input_names_[i])->index());
VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find input: " << input_names_[i];
return subgraph::FAILED;
}
}
// Get output tensor
std::vector<uint32_t> outs;
origin_otensors_.resize(output_names_.size());
origin_odims_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":"
<< origin_odims_[i].production();
origin_otensors_[i]->mutable_data<int8_t>();
    // Get output index
if (graph.Has(output_names_[i])) {
outs.push_back(graph.Get(output_names_[i])->index());
VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find output: " << output_names_[i];
return subgraph::FAILED;
}
}
VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size();
// Set subgraph input/output
(*neuron_model_identifyInputsAndOutputs)(
model_, ins.size(), &ins[0], outs.size(), &outs[0]);
neuron_errCode = (*neuron_model_finish)(model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
return subgraph::FAILED;
}
VLOG(3) << "[APU] APU NIR model created!";
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
compilation_ = lite::apu::Device::Global().Build(libHandle_, model_);
if (compilation_ == nullptr) {
LOG(WARNING) << "[APU] Build APU DLA model failed!";
return subgraph::FAILED;
}
VLOG(3) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us";
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
typedef int (*NeuronExecution_create)(NeuronCompilation * compilation,
NeuronExecution * *execution);
typedef void (*NeuronExecution_free)(NeuronExecution * execution);
typedef int (*NeuronExecution_setInput)(NeuronExecution * execution,
int32_t index,
const NeuronOperandType* type,
const void* buffer,
size_t length);
typedef int (*NeuronExecution_setOutput)(NeuronExecution * execution,
int32_t index,
const NeuronOperandType* type,
void* buffer,
size_t length);
typedef int (*NeuronExecution_compute)(NeuronExecution * execution);
LOAD_FUNCTIONS(libHandle_, NeuronExecution_create, neuron_execution_create)
LOAD_FUNCTIONS(libHandle_, NeuronExecution_free, neuron_execution_free)
LOAD_FUNCTIONS(
libHandle_, NeuronExecution_setInput, neuron_execution_setInput)
LOAD_FUNCTIONS(
libHandle_, NeuronExecution_setOutput, neuron_execution_setOutput)
LOAD_FUNCTIONS(libHandle_, NeuronExecution_compute, neuron_execution_compute)
NeuronExecution* run1 = NULL;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
int neuron_errCode = (*neuron_execution_create)(compilation_, &run1);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] Build APU runtime failed!";
return subgraph::FAILED;
}
// Set input buffer
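  // The framework-side tensors hold symmetric int8 data, while the compiled
  // APU model consumes asymmetric uint8, so each input byte is copied into a
  // temporary buffer and shifted by +128 before being bound to the execution.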
Tensor input_temp;
for (size_t i = 0; i < origin_itensors_.size(); i++) {
input_temp.Resize({origin_idims_[i]});
uint8_t* input_data = input_temp.mutable_data<uint8_t>();
memcpy(input_data,
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
input_data[j] += (uint8_t)128;
}
(*neuron_execution_setInput)(
run1, i, NULL, input_data, origin_itensors_[i]->memory_size());
}
// Set output buffer
for (size_t i = 0; i < origin_otensors_.size(); i++) {
(*neuron_execution_setOutput)(
run1,
i,
NULL,
reinterpret_cast<void*>(origin_otensors_[i]->raw_data()),
origin_otensors_[i]->memory_size());
}
neuron_errCode = (*neuron_execution_compute)(run1);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to run execution!" << neuron_errCode;
return subgraph::FAILED;
}
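  // Shift the uint8 results back by -128 in place so downstream kernels see
  // the original symmetric int8 representation again.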
for (size_t i = 0; i < origin_otensors_.size(); i++) {
int8_t* output_data = origin_otensors_[i]->mutable_data<int8_t>();
VLOG(3) << "output size:" << origin_otensors_[i]->memory_size();
for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
output_data[j] -= (int8_t)128;
}
}
(*neuron_execution_free)(run1);
VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
return 0;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace apu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kAPU,
kInt8,
kNCHW,
paddle::lite::kernels::apu::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "NeuronAdapter.h"
#include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace apu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
std::string model_name_;
void *libHandle_;
NeuronModel *model_;
NeuronCompilation *compilation_;
};
class SubgraphCompute
: public KernelLite<TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace apu
} // namespace kernels
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU)
return()
endif()
......
......@@ -19,3 +19,10 @@ if(LITE_WITH_RKNPU)
RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
endif()
if(LITE_WITH_APU)
lite_cc_test(test_mobilenetv1_int8_apu SRCS test_mobilenetv1_int8_apu.cc
DEPS ${lite_model_test_DEPS} paddle_api_full
APU_DEPS ${apu_kernels} ${apu_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
using namespace paddle::lite_api; // NOLINT
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
inline int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0]
<< " model_dir [thread_num] [warmup_times] [repeat_times] "
"[input_data_path] [output_data_path]"
<< std::endl;
return -1;
}
std::string model_dir = argv[1];
int thread_num = 1;
if (argc > 2) {
thread_num = atoi(argv[2]);
}
int warmup_times = 5;
if (argc > 3) {
warmup_times = atoi(argv[3]);
}
int repeat_times = 10;
if (argc > 4) {
repeat_times = atoi(argv[4]);
}
std::string input_data_path;
if (argc > 5) {
input_data_path = argv[5];
}
std::string output_data_path;
if (argc > 6) {
output_data_path = argv[6];
}
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
config.set_valid_places(
{paddle::lite_api::Place{
TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto input_data = input_tensor->mutable_data<float>();
auto input_size = ShapeProduction(input_tensor->shape());
// test loop
int total_imgs = 500;
float test_num = 0;
float top1_num = 0;
float top5_num = 0;
int output_len = 1000;
std::vector<int> index(1000);
bool debug = true; // false;
int show_step = 500;
for (int i = 0; i < total_imgs; i++) {
// set input
std::string filename = input_data_path + "/" + std::to_string(i);
std::ifstream fs(filename, std::ifstream::binary);
if (!fs.is_open()) {
std::cout << "open input file fail.";
}
auto input_data_tmp = input_data;
for (int i = 0; i < input_size; ++i) {
fs.read(reinterpret_cast<char*>(input_data_tmp), sizeof(*input_data_tmp));
input_data_tmp++;
}
int label = 0;
fs.read(reinterpret_cast<char*>(&label), sizeof(label));
fs.close();
if (debug && i % show_step == 0) {
std::cout << "input data:" << std::endl;
std::cout << input_data[0] << " " << input_data[10] << " "
<< input_data[input_size - 1] << std::endl;
std::cout << "label:" << label << std::endl;
}
// run
predictor->Run();
auto output0 = predictor->GetOutput(0);
auto output0_data = output0->data<float>();
// get output
std::iota(index.begin(), index.end(), 0);
    std::sort(index.begin(), index.end(), [output0_data](size_t i1, size_t i2) {
return output0_data[i1] > output0_data[i2];
});
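    // `index` is now ordered by descending score: index[0] is the top-1
    // prediction and index[0..4] form the top-5 candidate set used below.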
test_num++;
if (label == index[0]) {
top1_num++;
}
for (int i = 0; i < 5; i++) {
if (label == index[i]) {
top5_num++;
}
}
if (debug && i % show_step == 0) {
std::cout << index[0] << " " << index[1] << " " << index[2] << " "
<< index[3] << " " << index[4] << std::endl;
std::cout << output0_data[index[0]] << " " << output0_data[index[1]]
<< " " << output0_data[index[2]] << " "
<< output0_data[index[3]] << " " << output0_data[index[4]]
<< std::endl;
std::cout << output0_data[630] << std::endl;
}
if (i % show_step == 0) {
std::cout << "step " << i << "; top1 acc:" << top1_num / test_num
<< "; top5 acc:" << top5_num / test_num << std::endl;
}
}
std::cout << "final result:" << std::endl;
std::cout << "top1 acc:" << top1_num / test_num << std::endl;
std::cout << "top5 acc:" << top5_num / test_num << std::endl;
return 0;
}
......@@ -27,6 +27,8 @@ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.hua
BUILD_XPU=OFF
BUILD_XTCL=OFF
XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
BUILD_APU=OFF
APU_DDK_ROOT="$(pwd)/apu_sdk_lib/"
BUILD_RKNPU=OFF
RKNPU_DDK_ROOT="$(pwd)/rknpu/"
LITE_WITH_ARM_LANG=OFF
......@@ -143,6 +145,8 @@ function make_tiny_publish_so {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_APU=$BUILD_APU \
-DAPU_DDK_ROOT=$APU_DDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
......@@ -237,6 +241,8 @@ function make_full_publish_so {
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DLITE_WITH_TRAIN=$BUILD_TRAIN \
-DLITE_WITH_APU=$BUILD_APU \
-DAPU_DDK_ROOT=$APU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j$NUM_PROC
......@@ -271,6 +277,8 @@ function make_all_tests {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_APU=$BUILD_APU \
-DAPU_DDK_ROOT=$APU_DDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
......@@ -506,6 +514,14 @@ function main {
XPU_SDK_ROOT="${i#*=}"
shift
;;
--build_apu=*)
BUILD_APU="${i#*=}"
shift
;;
--apu_ddk_root=*)
APU_DDK_ROOT="${i#*=}"
shift
;;
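        # A typical APU-enabled invocation passes both of the flags above
        # together, e.g. (script name, paths, the final build target and any
        # other options are illustrative; they follow the existing NPU/XPU
        # usage of this script):
        #   ./lite/tools/build.sh ... --build_apu=ON --apu_ddk_root=$(pwd)/apu_sdk_lib ... tiny_publish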
--build_rknpu=*)
BUILD_RKNPU="${i#*=}"
shift
......
......@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
class TargetType:
kUnk = 0
kHost = 1
......@@ -65,13 +65,15 @@ class TargetType:
kCUDA = 3
kARM = 4
kOpenCL = 5
kAny = 6 # any target
kFPGA = 7
kNPU = 8
kXPU = 9
kBM = 10
kMLU = 11
kRKNPU = 12
kAny = 6 # any target
kAPU = 13
# record op_info of valid kernels into `valid_ops` according to different target type
with open(kernels_list_path) as f:
......