Commit d77b32fb authored by Jiansong Wang

NNA integration on develop branch

1. Remove a paragraph at the end of lite/CMakeLists.txt; otherwise the build fails on the original develop branch on Ubuntu 18.04.
2. Add NNA support; the light library builds successfully.
3. Full build: the library builds, but apps such as benchmark_bin still fail.
Parent 1a6880d6
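For context, a minimal sketch (not part of this commit) of how an application could request the new target once the library is built with LITE_WITH_NNA=ON. It assumes the existing Paddle-Lite C++ API (paddle_api.h, CxxConfig, set_valid_places, CreatePaddlePredictor) and a placeholder model path; the kNNA places mirror the ones added to the opt tool in this diff.

#include "paddle_api.h"  // existing Paddle-Lite C++ API header (assumed entry point)

int main() {
  using namespace paddle::lite_api;  // NOLINT
  CxxConfig config;
  config.set_model_dir("./model_dir");  // placeholder model path
  // Prefer the new Imagination NNA target, falling back to ARM.
  config.set_valid_places({
      Place{TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)},
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  // Feed inputs and read outputs through the predictor as usual.
  return 0;
}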
......@@ -87,6 +87,7 @@ lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_NNA "Enable Imagination NNA in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -171,6 +172,10 @@ if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
if(LITE_WITH_NNA)
include(device/nna)
endif()
include(external/flatbuffers)
# for mobile
......
......@@ -175,6 +175,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_NNA)
add_definitions("-DLITE_WITH_NNA")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_NNA)
return()
endif()
if(NOT DEFINED IMGNNA_DDK_ROOT)
set(IMGNNA_DDK_ROOT $ENV{IMGNNA_DDK_ROOT})
if(NOT IMGNNA_DDK_ROOT)
message(FATAL_ERROR "Must set IMGNNA_DDK_ROOT or env IMGNNA_DDK_ROOT when LITE_WITH_IMGNNA=ON")
endif()
endif()
message(STATUS "IMGNNA_DDK_ROOT: ${IMGNNA_DDK_ROOT}")
find_path(IMGNNA_DDK_INC NAMES imgdnn.h
PATHS ${IMGNNA_DDK_ROOT}/include/imgdnn NO_DEFAULT_PATH)
if(NOT IMGNNA_DDK_INC)
message(FATAL_ERROR "Can not find imgdnn.h in ${IMGNNA_DDK_ROOT}/include")
endif()
#include_directories("${IMGNNA_DDK_ROOT}/include")
include_directories(${IMGNNA_DDK_INC})
#set(IMGNNA_SUB_LIB_PATH "lib64")
#if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
# set(IMGNNA_SUB_LIB_PATH "lib64")
#endif()
#if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
# set(IMGNNA_SUB_LIB_PATH "lib")
#endif()
set(IMGNNA_LIB_PATH "lib")
find_library(IMGNNA_DDK_IMGDNN_FILE NAMES imgdnn
PATHS ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH})
if(NOT IMGNNA_DDK_IMGDNN_FILE)
message(FATAL_ERROR "Can not find IMGNNA_DDK_IMGDNN_FILE in ${IMGNNA_DDK_ROOT}")
else()
message(STATUS "Found IMGNNA_DDK IMGDNN Library: ${IMGNNA_DDK_IMGDNN_FILE}")
add_library(nna_ddk_imgdnn SHARED IMPORTED GLOBAL)
set_property(TARGET nna_ddk_imgdnn PROPERTY IMPORTED_LOCATION ${IMGNNA_DDK_IMGDNN_FILE})
endif()
find_library(IMGNNA_DDK_RUNTIME_FILE NAMES nnasession
PATHS ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH})
if(NOT IMGNNA_DDK_RUNTIME_FILE)
message(FATAL_ERROR "Can not find IMGNNA_DDK_RUNTIME_FILE in ${IMGNNA_DDK_ROOT}")
else()
message(STATUS "Found IMGNNA_DDK RUNTIME Library: ${IMGNNA_DDK_RUNTIME_FILE}")
add_library(nna_ddk_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET nna_ddk_runtime PROPERTY IMPORTED_LOCATION ${IMGNNA_DDK_RUNTIME_FILE})
endif()
set(nna_runtime_libs nna_ddk_runtime CACHE INTERNAL "imgnna ddk runtime libs")
set(nna_builder_libs nna_ddk_imgdnn CACHE INTERNAL "imgnna ddk builder libs")
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_NNA)
foreach(var ${lite_deps_NNA_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
......@@ -149,7 +155,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -160,6 +166,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
NNA_DEPS ${args_NNA_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
......@@ -200,7 +207,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -217,6 +224,7 @@ function(lite_cc_binary TARGET)
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
NNA_DEPS ${args_NNA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -254,7 +262,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -279,6 +287,7 @@ function(lite_cc_test TARGET)
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
NNA_DEPS ${args_NNA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -315,6 +324,7 @@ set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(nna_kernels CACHE INTERNAL "nna kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -331,12 +341,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU, NNA)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -448,6 +458,16 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NNA")
if (NOT LITE_WITH_NNA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(nna_kernels "${nna_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
......@@ -500,6 +520,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
NNA_DEPS ${args_NNA_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -519,7 +540,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -557,6 +578,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
NNA_DEPS ${args_NNA_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......
......@@ -14,6 +14,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_NNA:\t${LITE_WITH_NNA}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -93,6 +94,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
if (LITE_WITH_NNA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna")
endif(LITE_WITH_NNA)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......
......@@ -40,6 +40,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
......@@ -85,7 +86,10 @@ else()
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
if (LITE_WITH_NNA)
# Need to add Imagination NNA runtime libs dependency
#target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs})
endif()
endif()
endif()
......@@ -118,6 +122,11 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_NNA)
set(light_api_deps ${light_api_deps} ${nna_deps})
set(cxx_api_deps ${cxx_api_deps} ${nna_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
......@@ -137,6 +146,7 @@ list(LENGTH fpga_kernels num_fpga_kernels)
list(LENGTH bm_kernels num_bm_kernels)
list(LENGTH mlu_kernels num_mlu_kernels)
list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels)
list(LENGTH nna_kernels num_nna_kernels)
message(STATUS "Collected ${num_ops} ops")
message(STATUS "Collected ${num_x86_kernels} X86 kernels")
......@@ -152,6 +162,7 @@ message(STATUS "Collected ${num_fpga_kernels} FPGA kernels")
message(STATUS "Collected ${num_bm_kernels} BM kernels")
message(STATUS "Collected ${num_mlu_kernels} MLU kernels")
message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels")
message(STATUS "Collected ${num_imagination_nna_kernels} IMAGINATION_NNA kernels")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -169,6 +180,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
NNA_DEPS ${nna_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
......@@ -195,6 +207,7 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
NNA_DEPS ${nna_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
......@@ -219,6 +232,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
NNA_DEPS ${nna_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
......@@ -351,6 +365,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
NNA_DEPS ${nna_kernels}
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
......@@ -382,6 +397,7 @@ if(NOT WITH_COVERAGE)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
NNA_DEPS ${nna_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
......@@ -424,6 +440,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
NNA_DEPS ${nna_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
......@@ -444,6 +461,7 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
......@@ -460,6 +478,7 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
......@@ -476,6 +495,7 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
......@@ -486,6 +506,7 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
NNA_DEPS ${nna_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
......@@ -504,6 +525,7 @@ if(NOT IOS)
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -518,6 +540,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${npu_kernels}
NNA_DEPS ${nna_kernels}
XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
......
......@@ -125,6 +125,10 @@ std::vector<Place> ParserValidPlaces() {
} else if (target_repr == "apu") {
valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else if (target_repr == "nna") {
valid_places.emplace_back(TARGET(kNNA));
valid_places.emplace_back(
Place{TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......@@ -204,6 +208,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kNNA",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -269,16 +274,19 @@ void PrintHelpInfo() {
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|"
"nna)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|nna)"
"`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|nna)"
"`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -84,6 +84,10 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
} else if (target_repr == "apu") {
valid_places_.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else if (target_repr == "nna") {
valid_places_.emplace_back(TARGET(kNNA));
valid_places_.emplace_back(
Place{TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......@@ -240,7 +244,8 @@ void OptBase::PrintHelpInfo() {
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|"
"nna)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -277,16 +282,17 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -305,6 +311,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kNNA",
"kAny",
"kUnk"};
// Get the lengh of the first column: maximum length of the op_type
......
......@@ -81,7 +81,8 @@ const std::string& TargetToStr(TargetType target) {
"mlu",
"rknpu",
"apu",
"huawei_ascend_npu"};
"huawei_ascend_npu",
"nna"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -125,7 +126,8 @@ const std::string& TargetRepr(TargetType target) {
"kMLU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU"};
"kHuaweiAscendNPU",
"kNNA"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -171,7 +173,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
TARGET(kHuaweiAscendNPU),
TARGET(kNNA)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -58,7 +58,8 @@ enum class TargetType : int {
kRKNPU = 12,
kAPU = 13,
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
kNNA = 15,
NUM = 16, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -53,6 +53,7 @@ USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(nna_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
......
......@@ -39,17 +39,17 @@ namespace paddle {
namespace lite {
namespace pybind {
using lite_api::Tensor;
using lite::LightPredictorImpl;
using lite_api::CxxConfig;
using lite_api::MobileConfig;
using lite_api::PowerMode;
using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::MobileConfig;
using lite_api::OptBase;
using lite_api::Place;
using lite_api::PowerMode;
using lite_api::PrecisionType;
using lite_api::TargetType;
using lite_api::Tensor;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
......@@ -192,6 +192,7 @@ void BindLitePlace(py::module *m) {
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("NNA", TargetType::kNNA)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -11,3 +11,4 @@ add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
add_subdirectory(nna)
if(NOT LITE_WITH_NNA)
return()
endif()
lite_cc_library(device_nna SRCS imgdnn_manager.cc DEPS ${nna_builder_libs} ${nna_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "imgdnn_manager.h" // NOLINT
#include <utility>
namespace paddle {
namespace lite {
namespace nna {
static void err_callback(imgdnn_report_flags flags,
const char **tensor_names,
int num_tensor_names,
imgdnn_err_code error_code,
const char *error_message) {
std::string msg_prefix;
switch (flags) {
case imgdnn_report_flags::IMGDNN_REPORT_ERROR:
msg_prefix = "ERROR";
break;
case imgdnn_report_flags::IMGDNN_REPORT_VERBOSE:
msg_prefix = "VERBOSE";
break;
case imgdnn_report_flags::IMGDNN_REPORT_INFO:
msg_prefix = "INFO";
break;
case imgdnn_report_flags::IMGDNN_REPORT_WARNING:
msg_prefix = "WARNING";
break;
default:
std::cerr << "unknown report flag in error callback" << std::endl;
}
std::cerr << msg_prefix << ": " << error_message << std::endl;
}
ImgdnnManager::ImgdnnManager() {
err_ = imgdnnSetErrorHandler(err_callback);
net_ = imgdnnCreateNetwork(&err_);
ASSERT(err_ != IMGDNN_SUCCESS, "CreateNetwork failed!");
unsigned int num_devices;
err_ = imgdnnGetDevices(
IMGDNN_DEVICE_TYPE_ACCELERATOR, 1, &device_, &num_devices);
ASSERT(err_ != IMGDNN_SUCCESS, "GetDevices failed!");
context_ = imgdnnCreateContext(num_devices, &device_, 0, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "CreateContext failed!");
binding_ = imgdnnCreateBinding(&err_);
ASSERT(err_ != IMGDNN_SUCCESS, "CreateBinding failed!");
}
imgdnn_tensor ImgdnnManager::createConvolutionLayer(
imgdnn_tensor input_tensor,
imgdnn_tensor weights_tensor,
imgdnn_tensor bias_tensor,
imgdnn_quant_param dst_quant_param,
unsigned int stride[2],
unsigned int pad_begin[2],
unsigned int pad_end[2],
unsigned int dilation[2],
bool use_dwconv) {
imgdnn_tensor convw_tensor;
if (use_dwconv) {
// transpose weight
int order[4] = {1, 0, 2, 3};
imgdnn_tensor transport_weights =
imgdnnNetworkTransposeOp(net_, weights_tensor, order, &err_);
convw_tensor = imgdnnNetworkDepthConvolution2dOp_v2(net_,
input_tensor,
transport_weights,
stride,
pad_begin,
pad_end,
dilation,
&err_);
} else {
convw_tensor = imgdnnNetworkConvolution2dOp_v2(net_,
input_tensor,
weights_tensor,
stride,
pad_begin,
pad_end,
dilation,
&err_);
}
// debug
imgdnn_tensor_descriptor desc_1;
imgdnnGetTensorDescriptor(input_tensor, &desc_1);
imgdnnGetTensorDescriptor(weights_tensor, &desc_1);
imgdnnGetTensorDescriptor(convw_tensor, &desc_1);
imgdnn_tensor conv2d_tensor;
if (bias_tensor) {
imgdnn_tensor convw_int_tensor = imgdnnNetworkCastOp(
net_, convw_tensor, IMGDNN_TYPE_I32, nullptr, &err_);
imgdnn_tensor_descriptor bias_desc;
imgdnnGetTensorDescriptor(convw_tensor, &bias_desc);
imgdnn_tensor broadcast2_tensor;
broadcast2_tensor = imgdnnNetworkBroadcastOp(
net_, bias_tensor, 2, bias_desc.size[2], &err_);
imgdnn_tensor broadcast3_tensor;
broadcast3_tensor = imgdnnNetworkBroadcastOp(
net_, broadcast2_tensor, 3, bias_desc.size[3], &err_);
conv2d_tensor = imgdnnNetworkBinaryOp(
net_, convw_int_tensor, broadcast3_tensor, IMGDNN_OPERATION_ADD, &err_);
} else {
conv2d_tensor = convw_tensor;
}
imgdnn_tensor conv2d_out_tensor = conv2d_tensor;  // keep the un-cast result for non-quantized types
imgdnn_tensor_descriptor desc;
imgdnnGetTensorDescriptor(input_tensor, &desc);
if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
conv2d_out_tensor = imgdnnNetworkCastOp(
net_, conv2d_tensor, desc.type, &dst_quant_param, &err_);
}
return conv2d_out_tensor;
}
imgdnn_tensor ImgdnnManager::createBatchNormLayer(imgdnn_tensor input_tensor,
const void *const avg_in,
const void *const var_in,
const float eps) {
imgdnn_tensor bna_tensor;
imgdnn_tensor average_tensor;
imgdnn_tensor_descriptor av_desc;
imgdnn_tensor broadcast2_tensor;
imgdnn_tensor broadcast3_tensor;
unsigned int buffer_size;
imgdnn_tensor_descriptor in_desc;
imgdnnGetTensorDescriptor(input_tensor, &in_desc);
av_desc.dimensions = 2;
av_desc.type = in_desc.type;
av_desc.size[0] = in_desc.size[0];
av_desc.size[1] = in_desc.size[1];
average_tensor = createFixedInputTensor(&av_desc, avg_in, true);
broadcast2_tensor =
imgdnnNetworkBroadcastOp(net_, average_tensor, 2, in_desc.size[2], &err_);
broadcast3_tensor = imgdnnNetworkBroadcastOp(
net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
bna_tensor = imgdnnNetworkBinaryOp(
net_, input_tensor, broadcast3_tensor, IMGDNN_OPERATION_SUB, &err_);
imgdnn_tensor variance_tensor;
imgdnn_tensor_descriptor va_desc;
va_desc.dimensions = 2;
va_desc.type = in_desc.type;
va_desc.size[0] = in_desc.size[0];
va_desc.size[1] = in_desc.size[1];
buffer_size = imgdnnGetDescriptorSize(&va_desc, &err_);
float *variance = reinterpret_cast<float *>(GetBufromPool(buffer_size));
memcpy(variance, var_in, buffer_size);
// Perform 1/sqrt(var+eps) and Update var_data.
buffer_size /= sizeof(float);
for (size_t i = 0; i < buffer_size; i++) {
variance[i] = 1.0 / (sqrt(variance[i] + eps));
}
variance_tensor = createFixedInputTensor(&va_desc, variance, false);
broadcast2_tensor = imgdnnNetworkBroadcastOp(
net_, variance_tensor, 2, in_desc.size[2], &err_);
broadcast3_tensor = imgdnnNetworkBroadcastOp(
net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
imgdnn_tensor bn_tensor;
bn_tensor = imgdnnNetworkBinaryOp(
net_, bna_tensor, broadcast3_tensor, IMGDNN_OPERATION_MUL, &err_);
return bn_tensor;
}
imgdnn_tensor ImgdnnManager::createPoolingLayer(
imgdnn_tensor in_tensor,
imgdnn_quant_param dst_quant_param,
const unsigned int size[2],
const unsigned int stride[2],
const unsigned int pad_to_begin[2],
const unsigned int pad_to_end[2],
imgdnn_pooling_type type) {
// debug
imgdnn_tensor_descriptor desc_1;
imgdnnGetTensorDescriptor(in_tensor, &desc_1);
imgdnn_tensor pool_tensor = imgdnnNetworkPooling2dOp_v2(
net_, in_tensor, size, stride, pad_to_begin, pad_to_end, type, &err_);
// debug
imgdnnGetTensorDescriptor(pool_tensor, &desc_1);
imgdnn_tensor_descriptor desc;
imgdnnGetTensorDescriptor(in_tensor, &desc);
if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
pool_tensor = imgdnnNetworkCastOp(
net_, pool_tensor, desc.type, &dst_quant_param, &err_);
}
return pool_tensor;
}
imgdnn_tensor ImgdnnManager::createFullyConnectedLayer(
imgdnn_tensor input_tensor,
imgdnn_tensor weights_tensor,
imgdnn_tensor bias_tensor,
imgdnn_quant_param dst_quant_param) {
imgdnn_tensor fcw_tensor;
imgdnn_tensor fcb_tensor;
imgdnn_tensor_descriptor in_desc;
imgdnnGetTensorDescriptor(input_tensor, &in_desc);
// int flatten_dim = 1
for (unsigned i = 2; i < in_desc.dimensions; ++i)
in_desc.size[1] *= in_desc.size[i];
in_desc.dimensions = 2;
auto reshaped_input =
imgdnnNetworkReshapeOp(net_, input_tensor, &in_desc, &err_);
// debug
imgdnn_tensor_descriptor desc_1;
imgdnnGetTensorDescriptor(reshaped_input, &desc_1);
imgdnn_tensor_descriptor desc_2;
imgdnnGetTensorDescriptor(weights_tensor, &desc_2);
imgdnn_tensor_descriptor desc_3;
imgdnnGetTensorDescriptor(bias_tensor, &desc_3);
// handle weights [num_units, input_size] tensor
/* const int order[] = { 1, 0 };
auto isnu_weights_tensor = imgdnnNetworkTransposeOp(net,
weights_tensor,
order,
&err_);*/
fcw_tensor = imgdnnNetworkBinaryOp(
net_, reshaped_input, weights_tensor, IMGDNN_OPERATION_MATMUL, &err_);
if (bias_tensor) {
imgdnn_tensor fcw_int_tensor =
imgdnnNetworkCastOp(net_, fcw_tensor, IMGDNN_TYPE_I32, nullptr, &err_);
imgdnn_tensor_descriptor desc_4;
imgdnnGetTensorDescriptor(fcw_int_tensor, &desc_4);
fcb_tensor = imgdnnNetworkBinaryOp(
net_, fcw_int_tensor, bias_tensor, IMGDNN_OPERATION_ADD, &err_);
} else {
fcb_tensor = fcw_tensor;
}
imgdnn_tensor_descriptor desc;
imgdnnGetTensorDescriptor(input_tensor, &desc);
if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
fcb_tensor = imgdnnNetworkCastOp(
net_, fcb_tensor, desc.type, &dst_quant_param, &err_);
}
return fcb_tensor;
}
imgdnn_tensor ImgdnnManager::createSoftmaxLayer(
imgdnn_tensor input_tensor,
float beta,
unsigned int axis,
imgdnn_quant_param dst_quant_param) {
// debug
imgdnn_tensor_descriptor desc_1;
imgdnnGetTensorDescriptor(input_tensor, &desc_1);
imgdnn_tensor softmax_tensor =
imgdnnNetworkSoftmaxOp(net_, input_tensor, beta, axis, &err_);
imgdnn_tensor_descriptor desc;
imgdnnGetTensorDescriptor(input_tensor, &desc);
if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
softmax_tensor = imgdnnNetworkCastOp(
net_, softmax_tensor, desc.type, &dst_quant_param, &err_);
}
imgdnn_tensor_descriptor desc_2;
imgdnnGetTensorDescriptor(softmax_tensor, &desc_2);
return softmax_tensor;
}
imgdnn_tensor ImgdnnManager::createScaleLayer(imgdnn_tensor input_tensor,
bool with_biasscale,
const void *const scale,
const void *const bias) {
imgdnn_tensor sc_tensor;
imgdnn_tensor scale_tensor;
imgdnn_tensor_descriptor sc_desc;
imgdnn_tensor broadcast2_tensor;
imgdnn_tensor broadcast3_tensor;
unsigned int buffer_size;
imgdnn_tensor_descriptor in_desc;
imgdnnGetTensorDescriptor(input_tensor, &in_desc);
sc_desc.dimensions = 2;
sc_desc.type = in_desc.type;
sc_desc.size[0] = in_desc.size[0];
sc_desc.size[1] = in_desc.size[1];
scale_tensor = createFixedInputTensor(&sc_desc, scale, true);
broadcast2_tensor =
imgdnnNetworkBroadcastOp(net_, scale_tensor, 2, in_desc.size[2], &err_);
broadcast3_tensor = imgdnnNetworkBroadcastOp(
net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
sc_tensor = imgdnnNetworkBinaryOp(
net_, input_tensor, broadcast3_tensor, IMGDNN_OPERATION_MUL, &err_);
if (with_biasscale) {
imgdnn_tensor bsc_tensor;
imgdnn_tensor biasscale_tensor;
biasscale_tensor = createFixedInputTensor(&sc_desc, bias, true);
broadcast2_tensor = imgdnnNetworkBroadcastOp(
net_, biasscale_tensor, 2, in_desc.size[2], &err_);
broadcast3_tensor = imgdnnNetworkBroadcastOp(
net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
bsc_tensor = imgdnnNetworkBinaryOp(
net_, sc_tensor, broadcast3_tensor, IMGDNN_OPERATION_ADD, &err_);
return bsc_tensor;
} else {
return sc_tensor;
}
}
imgdnn_network_object ImgdnnManager::createNetworkObject(
unsigned int num_inputs,
imgdnn_tensor *inputs,
unsigned int num_outputs,
imgdnn_tensor *outputs) {
const imgdnn_network_object_flags flags = 0;
std::string options_str;
std::string ddk_root{"/home/jasonwang/imgtools/ndk/main/"};
std::string hwconfig =
ddk_root + "nna-tools/config/mirage_hw_config06_23_2_6500_301.json";
std::string mapconfig = ddk_root + "nna-tools/config/mapconfig_q8a.json";
options_str += "-h " + hwconfig;
options_str += " -m " + mapconfig;
// options_str += " --dump_debug_binaries enabled";
net_obj_ = imgdnnCreateNetworkObject(device_,
context_,
net_,
num_inputs,
inputs,
num_outputs,
outputs,
flags,
options_str.c_str(),
&err_);
ASSERT(err_ != IMGDNN_SUCCESS, "CreateNetworkObject failed!");
return net_obj_;
}
} // namespace nna
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "imgdnn.h" // NOLINT
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace nna {
static inline void CheckAndPrint(bool cond,
const char *msg,
int line,
const char *filename) {
if (cond) {
std::stringstream err_msg;
err_msg << "ERROR: " << msg << "\n";
err_msg << "Violated condition at line " << line << " in " << filename;
std::cerr << err_msg.str() << "\n";
exit(EXIT_FAILURE);
}
}
#define ASSERT(statement, msg) \
lite::nna::CheckAndPrint(statement, msg, __LINE__, __FILE__)
class ImgdnnManager {
imgdnn_err_code err_;
imgdnn_device device_;
imgdnn_network net_{nullptr};
imgdnn_context context_{nullptr};
imgdnn_binding binding_{nullptr};
imgdnn_network_object net_obj_{nullptr};
std::vector<uint8_t *> coef_pool;
public:
ImgdnnManager();
virtual ~ImgdnnManager() {
std::cout << "~ImgdnnManager called" << std::endl;
if (net_obj_) err_ = imgdnnNetworkObjectDestroy(net_obj_);
if (context_) err_ = imgdnnContextDestroy(context_);
if (binding_) err_ = imgdnnBindingDestroy(binding_);
if (net_) err_ = imgdnnNetworkDestroy(net_);
for (auto buf : coef_pool) delete[] buf;
}
uint8_t *GetBufromPool(size_t size) {
uint8_t *buf = new uint8_t[size];
coef_pool.push_back(buf);
return buf;
}
imgdnn_network GetNetwork() { return net_; }
imgdnn_tensor createInputTensor(imgdnn_tensor_descriptor *desc) {
return imgdnnNetworkInput(net_, desc, &err_);
}
imgdnn_tensor createFixedInputTensor(imgdnn_tensor_descriptor *desc,
const void *const fixed_data,
bool mem_copy) {
imgdnn_tensor fixed_input;
if (mem_copy) {
size_t buffer_size = imgdnnGetDescriptorSize(desc, &err_);
void *buf = GetBufromPool(buffer_size);
memcpy(buf, fixed_data, buffer_size);
fixed_input = imgdnnNetworkFixedInput(net_, desc, buf, &err_);
} else {
fixed_input = imgdnnNetworkFixedInput(net_, desc, fixed_data, &err_);
}
return fixed_input;
}
imgdnn_tensor createConvolutionLayer(imgdnn_tensor input_tensor,
imgdnn_tensor weights_tensor,
imgdnn_tensor bias_tensor,
imgdnn_quant_param dst_quant_param,
unsigned int stride[2],
unsigned int pad_begin[2],
unsigned int pad_end[2],
unsigned int dilation[2],
bool use_dwconv = false);
imgdnn_tensor createBatchNormLayer(imgdnn_tensor input_tensor,
const void *const avg_in,
const void *const var_in,
const float eps);
imgdnn_tensor createPoolingLayer(imgdnn_tensor in_tensor,
imgdnn_quant_param dst_quant_param,
const unsigned int size[2],
const unsigned int stride[2],
const unsigned int pad_to_begin[2],
const unsigned int pad_to_end[2],
imgdnn_pooling_type type);
imgdnn_tensor createFullyConnectedLayer(imgdnn_tensor input_tensor,
imgdnn_tensor weights_tensor,
imgdnn_tensor bias_tensor,
imgdnn_quant_param dst_quant_param);
imgdnn_tensor createSoftmaxLayer(imgdnn_tensor in_tensor,
float beta,
unsigned int axis,
imgdnn_quant_param dst_quant_param);
imgdnn_tensor createScaleLayer(imgdnn_tensor input_tensor,
bool with_biasscale,
const void *const scale,
const void *const bias);
imgdnn_tensor createReLULayer(imgdnn_tensor in_tensor,
bool has_min_clamp,
float min_clamp,
bool has_max_clamp,
float max_clamp,
float negative_slope) {
imgdnn_tensor relu_tensor = imgdnnNetworkReLUOp(net_,
in_tensor,
has_min_clamp,
min_clamp,
has_max_clamp,
max_clamp,
negative_slope,
&err_);
ASSERT(err_ != IMGDNN_SUCCESS, "ReLU OP fails");
imgdnn_tensor_descriptor in_desc, relu_desc;
imgdnnGetTensorDescriptor(in_tensor, &in_desc);
imgdnnGetTensorDescriptor(relu_tensor, &relu_desc);
if (relu_desc.type != in_desc.type) {
relu_tensor = imgdnnNetworkCastOp(
net_, relu_tensor, in_desc.type, &in_desc.quant_param, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "ReLU cast fails");
}
return relu_tensor;
}
imgdnn_network_object createNetworkObject(unsigned int num_inputs,
imgdnn_tensor *inputs,
unsigned int num_outputs,
imgdnn_tensor *outputs);
imgdnn_memory importMemory(
void *memory,
size_t size,
imgdnn_import_mem_type import_mem_type = IMGDNN_IMPORT_MEM_TYPE_CPU) {
imgdnn_memory mem =
imgdnnImportMemory(context_, memory, size, import_mem_type, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "ImportMemory fails");
return mem;
}
imgdnn_memory allocateMemory(size_t size) {
imgdnn_memory mem = imgdnnAllocateMemory(context_, size, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "AllocateMemory fails");
return mem;
}
void destroyMemory(imgdnn_memory memory) {
err_ = imgdnnMemoryDestroy(memory);
ASSERT(err_ != IMGDNN_SUCCESS, "MemoryDestroy fails");
}
void *lockMemory(imgdnn_memory memory, imgdnn_lock_access lock_access) {
void *mem = imgdnnMemoryLock(memory, lock_access, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "MemoryLock fails");
return mem;
}
void unlockMemory(imgdnn_memory memory) {
err_ = imgdnnMemoryUnlock(memory);
ASSERT(err_ != IMGDNN_SUCCESS, "MemoryUnLock fails");
}
void getNetworkObjectInputs(unsigned int max_inputs,
imgdnn_input inputs[],
unsigned int *num_inputs) {
ASSERT(net_obj_ == nullptr, "NetworkObject NULL when get its inputs");
err_ =
imgdnnNetworkObjectGetInputs(net_obj_, max_inputs, inputs, num_inputs);
ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectGetInputs failed!");
}
void getNetworkObjectOutputs(unsigned int max_outputs,
imgdnn_output outputs[],
unsigned int *num_outputs) {
ASSERT(net_obj_ == nullptr, "NetworkObject NULL when get its outputs");
err_ = imgdnnNetworkObjectGetOutputs(
net_obj_, max_outputs, outputs, num_outputs);
ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectGetOutputs failed!");
}
imgdnn_tensor_descriptor getInputDescriptor(imgdnn_input input) {
imgdnn_tensor_descriptor desc = imgdnnGetInputDescriptor(input, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "GetInputDescriptors failed!");
return desc;
}
imgdnn_tensor_descriptor getOutputDescriptor(imgdnn_output output) {
imgdnn_tensor_descriptor desc = imgdnnGetOutputDescriptor(output, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "GetOutputDescriptors failed!");
return desc;
}
size_t getDescriptorSize(const imgdnn_tensor_descriptor *const descriptor) {
size_t size = imgdnnGetDescriptorSize(descriptor, &err_);
ASSERT(err_ != IMGDNN_SUCCESS, "GetDescriptorSize failed!");
return size;
}
void addBindingInput(imgdnn_input input, imgdnn_memory memory) {
err_ = imgdnnBindingAddInput(binding_, input, memory);
ASSERT(err_ != IMGDNN_SUCCESS, "BindingAddInput failed!");
}
void addBindingOutput(imgdnn_output output, imgdnn_memory memory) {
err_ = imgdnnBindingAddOutput(binding_, output, memory);
ASSERT(err_ != IMGDNN_SUCCESS, "BindingAddOutput failed!");
}
void executeNetworkObject(bool blocking_execute,
unsigned int num_events_in_wait_list,
const imgdnn_event event_wait_list[],
imgdnn_event *event) {
err_ = imgdnnNetworkObjectExecute(net_obj_,
binding_,
blocking_execute,
num_events_in_wait_list,
event_wait_list,
event);
ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectExecute failed!");
}
};
} // namespace nna
} // namespace lite
} // namespace paddle
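To make the intended flow of this helper concrete, below is a minimal usage sketch assembled only from the methods declared in the class above; it is an illustration, not code from this commit. The descriptor contents, weight data, and quantization parameters are assumed to be supplied by the caller, as the NNA subgraph bridges further down do, and imgdnn_tensor handles are assumed to be nullable.

// Hypothetical usage sketch of ImgdnnManager (illustration only).
#include "imgdnn_manager.h"

void RunTinyConvNet(imgdnn_tensor_descriptor in_desc,
                    imgdnn_tensor_descriptor w_desc,
                    const void *weights,
                    imgdnn_quant_param out_quant) {
  paddle::lite::nna::ImgdnnManager mgr;

  // 1. Describe the graph: input -> conv -> relu.
  imgdnn_tensor in = mgr.createInputTensor(&in_desc);
  imgdnn_tensor w = mgr.createFixedInputTensor(&w_desc, weights, true);
  unsigned int stride[2] = {1, 1};
  unsigned int pad_begin[2] = {0, 0};
  unsigned int pad_end[2] = {0, 0};
  unsigned int dilation[2] = {1, 1};
  imgdnn_tensor conv = mgr.createConvolutionLayer(
      in, w, nullptr /* no bias */, out_quant,
      stride, pad_begin, pad_end, dilation, false);
  imgdnn_tensor out = mgr.createReLULayer(conv, true, 0.f, false, 0.f, 0.f);

  // 2. Compile the graph into an executable network object.
  mgr.createNetworkObject(1, &in, 1, &out);

  // 3. Bind memory to the compiled inputs/outputs and execute.
  imgdnn_input inputs[1];
  imgdnn_output outputs[1];
  unsigned int num_in = 0, num_out = 0;
  mgr.getNetworkObjectInputs(1, inputs, &num_in);
  mgr.getNetworkObjectOutputs(1, outputs, &num_out);
  imgdnn_tensor_descriptor real_in = mgr.getInputDescriptor(inputs[0]);
  imgdnn_tensor_descriptor real_out = mgr.getOutputDescriptor(outputs[0]);
  imgdnn_memory in_mem = mgr.allocateMemory(mgr.getDescriptorSize(&real_in));
  imgdnn_memory out_mem = mgr.allocateMemory(mgr.getDescriptorSize(&real_out));
  mgr.addBindingInput(inputs[0], in_mem);
  mgr.addBindingOutput(outputs[0], out_mem);
  // Input data would be written through lockMemory()/unlockMemory() before
  // this call, and results read back the same way afterwards.
  mgr.executeNetworkObject(true, 0, nullptr, nullptr);  // blocking run
  mgr.destroyMemory(in_mem);
  mgr.destroyMemory(out_mem);
}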
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${nna_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -64,6 +64,7 @@ using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
using NNAContext = Context<TargetType::kNNA>;
template <>
class Context<TargetType::kHost> {
......@@ -173,6 +174,21 @@ class Context<TargetType::kRKNPU> {
};
#endif
#ifdef LITE_WITH_NNA
template <>
class Context<TargetType::kNNA> {
public:
Context() {}
explicit Context(const NNAContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(NNAContext* ctx) {}
// NNAContext& operator=(const NNAContext& ctx) {}
std::string name() const { return "NNAContext"; }
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
......@@ -471,6 +487,12 @@ class ContextScheduler {
&ctx->As<BMContext>());
break;
#endif
#ifdef LITE_WITH_NNA
case TARGET(kNNA):
kernel_contexts_[TargetType::kNNA].As<NNAContext>().CopySharedTo(
&ctx->As<NNAContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
......@@ -533,6 +555,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif
#ifdef LITE_WITH_NNA
InitContext<TargetType::kNNA, NNAContext>();
#endif
}
......
......@@ -52,21 +52,21 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
"feed",
"fetch"};
auto insert_invalid_op_nodes_for_specific_target = [&](
std::set<std::string> op_node_set, TargetType specific_target) {
std::set<std::string> invalid_op_nodes_opencl = {"layout", "fc"};
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
TargetType op_target_type = op_node->AsStmt().place().target;
if (op_target_type == specific_target &&
specific_target == TARGET(kOpenCL)) {
invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(),
invalid_op_nodes_opencl.end());
break;
}
// else if // you can add more targets
}
};
auto insert_invalid_op_nodes_for_specific_target =
[&](std::set<std::string> op_node_set, TargetType specific_target) {
std::set<std::string> invalid_op_nodes_opencl = {"layout", "fc"};
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
TargetType op_target_type = op_node->AsStmt().place().target;
if (op_target_type == specific_target &&
specific_target == TARGET(kOpenCL)) {
invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(),
invalid_op_nodes_opencl.end());
break;
}
// else if // you can add more targets
}
};
VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size();
insert_invalid_op_nodes_for_specific_target(invalid_op_nodes,
......@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kRKNPU),
TARGET(kAPU),
TARGET(kMLU),
TARGET(kHuaweiAscendNPU)});
TARGET(kHuaweiAscendNPU),
TARGET(kNNA)});
......@@ -128,6 +128,20 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void NNASubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/nna/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -147,3 +161,5 @@ REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
.BindTargets({TARGET(kRKNPU)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
REGISTER_MIR_PASS(nna_subgraph_pass, paddle::lite::mir::NNASubgraphPass)
.BindTargets({TARGET(kNNA)});
......@@ -57,6 +57,11 @@ class MLUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class NNASubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -126,6 +126,7 @@ class Optimizer {
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"imagination_nna_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
......
......@@ -17,6 +17,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -47,6 +48,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
NNA_DEPS ${nna_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -15,3 +15,4 @@ add_subdirectory(apu)
add_subdirectory(bm)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
add_subdirectory(nna)
add_subdirectory(bridges)
add_kernel(subgraph_compute_nna NNA basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_nna subgraph_bridge_engine ${nna_subgraph_bridges})
if(NOT LITE_WITH_NNA)
return()
endif()
lite_cc_library(subgraph_bridge_utility_nna SRCS utility.cc DEPS ${nna_builder_libs} ${nna_runtime_libs} tensor)
lite_cc_library(subgraph_bridge_graph_nna SRCS graph.cc DEPS subgraph_bridge_utility_nna)
set(nna_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_nna subgraph_bridge_graph_nna)
lite_cc_library(subgraph_bridge_fc_op_nna SRCS fc_op.cc DEPS ${nna_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_op_nna SRCS conv_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_matmul_op_nna SRCS matmul_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_mul_op_nna SRCS mul_op.cc DEPS ${nna_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_nna SRCS act_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_scale_op_nna SRCS scale_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_softmax_op_nna SRCS softmax_op.cc DEPS ${nna_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_nna SRCS pool_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_batch_norm_op_nna SRCS batch_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_elementwise_ops_nna SRCS elementwise_ops.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_reshape_op_nna SRCS reshape_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_conv_transpose_op_nna SRCS conv_transpose_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_interpolate_op_nna SRCS interpolate_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_transpose_op_nna SRCS transpose_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_split_op_nna SRCS split_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_concat_op_nna SRCS concat_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_shuffle_channel_op_nna SRCS shuffle_channel_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_pad2d_op_nna SRCS pad2d_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_reduce_mean_op_nna SRCS reduce_mean_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_unsqueeze_op_nna SRCS unsqueeze_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_gather_op_nna SRCS gather_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_lookup_table_op_nna SRCS lookup_table_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_argmax_op_nna SRCS argmax_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_instance_norm_op_nna SRCS instance_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_dropout_op_nna SRCS dropout_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_topk_op_nna SRCS topk_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_layer_norm_op_nna SRCS layer_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_fill_constant_op_nna SRCS fill_constant_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_nna SRCS fill_constant_batch_size_like_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_increment_op_nna SRCS increment_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_compare_op_nna SRCS compare_op.cc DEPS ${nna_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_shape_op_nna SRCS shape_op.cc DEPS ${nna_subgraph_bridge_deps})
set(nna_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_nna
subgraph_bridge_graph_nna
subgraph_bridge_fc_op_nna
subgraph_bridge_conv_op_nna
#subgraph_bridge_matmul_op_nna
#subgraph_bridge_mul_op_nna
subgraph_bridge_act_op_nna
#subgraph_bridge_scale_op_nna
#subgraph_bridge_softmax_op_nna
subgraph_bridge_pool_op_nna
#subgraph_bridge_batch_norm_op_nna
#subgraph_bridge_elementwise_ops_nna
#subgraph_bridge_reshape_op_nna
#subgraph_bridge_conv_transpose_op_nna
#subgraph_bridge_interpolate_op_nna
#subgraph_bridge_transpose_op_nna
#subgraph_bridge_split_op_nna
#subgraph_bridge_concat_op_nna
#subgraph_bridge_shuffle_channel_op_nna
#subgraph_bridge_pad2d_op_nna
#subgraph_bridge_reduce_mean_op_nna
#subgraph_bridge_unsqueeze_op_nna
#subgraph_bridge_gather_op_nna
#subgraph_bridge_lookup_table_op_nna
#subgraph_bridge_argmax_op_nna
#subgraph_bridge_instance_norm_op_nna
#subgraph_bridge_dropout_op_nna
#subgraph_bridge_topk_op_nna
#subgraph_bridge_layer_norm_op_nna
#subgraph_bridge_fill_constant_op_nna
#subgraph_bridge_fill_constant_batch_size_like_op_nna
#subgraph_bridge_increment_op_nna
#subgraph_bridge_compare_op_nna
CACHE INTERNAL "nna_subgraph_bridges")
message(STATUS "+++++ nna_subgraph_bridges: ${nna_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
// template <typename ActType>
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->precision() == PRECISION(kFloat));
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
// x_node = graph->Add(x_name, *x);
LOG(WARNING) << "ActConverter:x_node not in graph";
}
imgdnn_tensor relu_output = graph->GetBuilder()->createReLULayer(
x_node->data(), true, 0.0, false, 0.0, 0.0);
imgdnn_tensor_descriptor desc;
imgdnn_err_code err = imgdnnGetTensorDescriptor(relu_output, &desc);
CHECK(err == IMGDNN_SUCCESS) << "fail get tensor description(RELU)";
graph->Add(out_name, relu_output, desc.type);
return SUCCESS;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
#if 0
REGISTER_SUBGRAPH_BRIDGE(
sigmoid,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
#endif
REGISTER_SUBGRAPH_BRIDGE(relu, kNNA, paddle::lite::subgraph::nna::ActConverter);
#if 0
REGISTER_SUBGRAPH_BRIDGE(
tanh, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu_clipped,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu6, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
leaky_relu,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
abs, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softsign,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softplus,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
hard_sigmoid,
kNNA,
paddle::lite::subgraph::nna::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
log, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Log>);
REGISTER_SUBGRAPH_BRIDGE(
square, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Square>);
REGISTER_SUBGRAPH_BRIDGE(
sqrt, kNNA, paddle::lite::subgraph::nna::ActConverter<ge::op::Sqrt>);
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/registry.h"
#include "lite/kernels/nna/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto scale_name = op_info->Input("Scale").front();
auto scale = scope->FindMutableTensor(scale_name);
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
auto mean_name = op_info->Input("Mean").front();
auto mean = scope->FindMutableTensor(mean_name);
auto variance_name = op_info->Input("Variance").front();
auto variance = scope->FindMutableTensor(variance_name);
auto y_name = op_info->Output("Y").front();
// float momentum = op_info->GetAttr<float>("momentum");
float epsilon = op_info->GetAttr<float>("epsilon");
// int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
/*
bool use_global_stats = !op_info->HasAttr("use_global_stats") ||
op_info->GetAttr<bool>("use_global_stats");
if (!use_global_stats) {
LOG(WARNING) << "[NNA] Only use_global_stats=true is supported by DDK";
}
*/
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
  x_node = graph->Get(x_name);
} else {
  // The input node should already have been created by a preceding bridge.
  LOG(WARNING) << "[NNA] BatchNorm input node " << x_name
               << " not found in graph";
  return FAILED;
}
ConvNetBuilder& builder = graph->GetBuilder();
auto bn_out = builder.createBatchNormLayer(x_node->data(),
mean->mutable_data<float>(),
variance->mutable_data<float>(),
epsilon);
bn_out = builder.createScaleLayer(
bn_out, true, scale->mutable_data<float>(), bias->mutable_data<float>());
// PrecisionType precision = x->precision();
imgdnn_tensor_descriptor desc;
imgdnn_err_code err = imgdnnGetTensorDescriptor(bn_out, &desc);
CHECK(err == IMGDNN_SUCCESS) << "failed to get tensor descriptor (BN)";
graph->Add(y_name, bn_out, desc.type);
return SUCCESS;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(batch_norm,
kNNA,
paddle::lite::subgraph::nna::BatchNormConverter);
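The batch_norm bridge above lowers the op into an imgdnn batch-norm layer followed by a scale layer. For reference, those two layers together implement the usual batch-norm inference formula; the following standalone sketch of that arithmetic is illustrative only (it is not part of the patch or the imgdnn DDK) and assumes NCHW data with per-channel parameters of length C.
#include <cmath>
#include <cstddef>
#include <vector>
// y[n][c][h][w] = scale[c] * (x[n][c][h][w] - mean[c]) / sqrt(var[c] + eps) + bias[c]
static void BatchNormScaleReference(const std::vector<float>& x,
                                    const std::vector<float>& mean,
                                    const std::vector<float>& variance,
                                    const std::vector<float>& scale,
                                    const std::vector<float>& bias,
                                    float epsilon,
                                    std::size_t n, std::size_t c, std::size_t hw,
                                    std::vector<float>* y) {
  y->resize(n * c * hw);
  for (std::size_t i = 0; i < n; ++i) {
    for (std::size_t j = 0; j < c; ++j) {
      // normalize with the running statistics, then apply scale/bias
      const float inv_std = 1.0f / std::sqrt(variance[j] + epsilon);
      for (std::size_t k = 0; k < hw; ++k) {
        const std::size_t idx = (i * c + j) * hw + k;
        (*y)[idx] = scale[j] * (x[idx] - mean[j]) * inv_std + bias[j];
      }
    }
  }
}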
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
int ConvConverter(void *ctx, OpLite *op, KernelBase *kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph *>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " << op_type << "... ";
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
auto output_name = op_info->Output("Output").front();
auto output = scope->FindMutableTensor(output_name);
auto output_dims = output->dims();
auto bs = input_dims[0];
auto ic = input_dims[1];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4L);
CHECK_EQ(output_dims.size(), 4L);
CHECK_EQ(filter_dims.size(), 4L);
CHECK_EQ(output_dims[0], bs);
CHECK_EQ(output_dims[1], oc);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
std::vector<float> weight_scale;
TensorInfo qnt;
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
output_scale = op_info->GetAttr<float>("output_scale");
weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
}
// Input node
std::shared_ptr<Node> input_node = nullptr;
imgdnn_tensor in_tensor;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
in_tensor = input_node->data();
} else {
TensorInfoReset(&qnt);
if (enable_int8)
qnt.type = IMGDNN_TYPE_Q_U8;
else
qnt.type = IMGDNN_TYPE_F32;
qnt.scales.push_back(input_scale);
qnt.zero_points.push_back(128);
input_node = graph->Add(input_name, *input, qnt, Node::Role::kInput);
in_tensor = input_node->data();
}
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[NNA] Paddings size should be the same or twice as the input size.";
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
// Check depthwise mode, and decide whether use ConvolutionDepthwise Op
bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
// Filter node
std::shared_ptr<Node> filter_node = nullptr;
imgdnn_tensor filter_tensor;
bool per_channel = isScalesPerChannel(weight_scale);
TensorInfoReset(&qnt);
uint8_t *weights_u8 =
graph->GetBuilder()->GetBufromPool(filter_dims.production());
if (enable_int8) {
char *weight_src = static_cast<char *>(filter->raw_data());
qnt.type = IMGDNN_TYPE_Q_U8;
if (per_channel) {
qnt.scales.assign(weight_scale.begin(), weight_scale.end());
qnt.zero_points.assign(weight_scale.size(), 128);
qnt.count = oc;
qnt.axis = 1;
} else {
qnt.scales.push_back(weight_scale.at(0));
qnt.zero_points.push_back(128);
}
for (int i = 0; i < filter_dims.production(); i++) {
weights_u8[i] = static_cast<uint8_t>(weight_src[i] + 128);
}
filter_node = graph->Add(filter_name,
weights_u8,
filter_dims.Vectorize(),
qnt,
Node::Role::kConst);
filter_tensor = filter_node->data();
} else {
qnt.type = IMGDNN_TYPE_F32;
filter_node = graph->Add(filter_name, *filter, qnt, Node::Role::kConst);
}
// Add bias node if exists bias
// Supports the bias nodes with the following dimensions
// 0: {oc}
// 1: {1, oc, oh, ow}
// 2: {n, oc, oh, ow}
std::shared_ptr<Node> bias_node = nullptr;
imgdnn_tensor bias_tensor = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {1, oc, 1, 1};
} else if (bias_data_size == output_data_size / bs) {
// 1: {1, oc, oh, ow}
bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
} else if (bias_data_size == output_data_size) {
// 2: {n, oc, oh, ow}
bias_shape = output_dims.Vectorize();
} else {
LOG(WARNING)
<< "[NNA] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
return FAILED;
}
TensorInfoReset(&qnt);
std::vector<int64_t> shapes{1, oc};
auto bias_data = bias->data<float, float>();
if (enable_int8) {
qnt.type = IMGDNN_TYPE_I32;
if (per_channel) {
qnt.scales.resize(bias_data_size);
for (int i = 0; i < bias_data_size; i++)
qnt.scales[i] = input_scale * weight_scale[i];
qnt.zero_points.assign(bias_data_size, 0);
qnt.count = 2;
qnt.axis = 1;
} else {
qnt.scales.push_back(input_scale * weight_scale[0]);
qnt.zero_points.push_back(0);
}
int quant_bits = 32;
auto dtype_max = static_cast<int>((1LL << (quant_bits - 1)) - 1);  // avoid signed overflow at 1 << 31
auto dtype_min = static_cast<int>(0 - dtype_max);
int32_t *bias_qnt_data =
reinterpret_cast<int32_t *>(graph->GetBuilder()->GetBufromPool(
bias_dims.production() * sizeof(int32_t)));
for (int i = 0; i < bias_data_size; i++) {
float current_scale = per_channel ? qnt.scales[i] : qnt.scales[0];
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / current_scale),
dtype_min),
dtype_max);
}
bias_node = graph->Add(
bias_name, bias_qnt_data, shapes, qnt, Node::Role::kConst);
} else {
qnt.type = IMGDNN_TYPE_F32;
std::vector<float> bias_float_data(bias_data,
bias_data + bias_data_size);
bias_node = graph->Add(
bias_name, bias_float_data.data(), shapes, qnt, Node::Role::kConst);
}
bias_tensor = bias_node->data();
}
}
unsigned int img_stride[2] = {(unsigned int)strides[0],
(unsigned int)strides[1]};
unsigned int pad_to_begin[2] = {(unsigned int)paddings[0],
(unsigned int)paddings[2]}; // top,left
unsigned int pad_to_end[2] = {(unsigned int)paddings[1],
(unsigned int)paddings[3]}; // bottom,right
unsigned int img_dilation[2] = {(unsigned int)dilations[0],
(unsigned int)dilations[1]};
imgdnn_quant_param output_quant_param;
output_quant_param.scale = output_scale;
output_quant_param.zero_point = 128;
imgdnn_tensor conv_out =
graph->GetBuilder()->createConvolutionLayer(in_tensor,
filter_tensor,
bias_tensor,
output_quant_param,
img_stride,
pad_to_begin,
pad_to_end,
img_dilation,
is_depthwise_mode);
if (!act_type.empty()) {
imgdnn_tensor act_out = conv_out;  // fall back to the conv output if act_type is unhandled
if (act_type == "leaky_relu") {
act_out = graph->GetBuilder()->createReLULayer(
conv_out, false, 0.0, false, 0.0, leaky_relu_alpha);
} else if (act_type == "relu6") {
act_out = graph->GetBuilder()->createReLULayer(
conv_out, true, 0.0, true, 6.0, false);
} else if (act_type == "relu") {
act_out = graph->GetBuilder()->createReLULayer(
conv_out, true, 0.0, false, 0.0, false);
} else {
VLOG(3) << "act_type: " << act_type << " Not handled";
}
graph->Add(output_name, act_out, IMGDNN_TYPE_Q_U8);
} else {
graph->Add(output_name, conv_out, IMGDNN_TYPE_Q_U8);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kNNA,
paddle::lite::subgraph::nna::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kNNA,
paddle::lite::subgraph::nna::ConvConverter);
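Both conv2d paths above re-encode Paddle's symmetric int8 weights into the unsigned 8-bit layout with a fixed zero point of 128, and quantize the float bias to int32 with scale = input_scale * weight_scale. The helpers below are an illustrative standalone sketch of that arithmetic (they are not DDK API and not part of the patch).
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>
// Symmetric int8 -> unsigned 8-bit with zero point 128.
static std::vector<uint8_t> ShiftInt8ToUint8(const std::vector<int8_t>& w) {
  std::vector<uint8_t> out(w.size());
  for (std::size_t i = 0; i < w.size(); ++i)
    out[i] = static_cast<uint8_t>(static_cast<int16_t>(w[i]) + 128);
  return out;
}
// Float bias -> int32 with scale = input_scale * weight_scale, clamped to int32 range.
static std::vector<int32_t> QuantizeBias(const std::vector<float>& bias,
                                         float input_scale,
                                         float weight_scale) {
  const int32_t qmax = 2147483647;
  const int32_t qmin = -qmax;
  const float scale = input_scale * weight_scale;
  std::vector<int32_t> out(bias.size());
  for (std::size_t i = 0; i < bias.size(); ++i)
    out[i] = std::min(std::max(static_cast<int32_t>(bias[i] / scale), qmin), qmax);
  return out;
}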
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "imgdnn.h" // NOLINT
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " + op_type + "...";
auto input_name = op_info->Input("Input").front();
auto input = scope->FindTensor(input_name);
auto input_dims = input->dims();
auto weight_name = op_info->Input("W").front();
auto weights = scope->FindTensor(weight_name);
auto w_dims = weights->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
// Notes: m = number of input rows, k = number of input cols,
//        n = number of weight cols.
// e.g. input_dims = {1, 1024, 1, 1}, in_num_col_dims = 1,
//      w_dims = {1024, 1000}  =>  m = 1, k = 1024, n = 1000
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[NNA] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
std::vector<float> weight_scale;
TensorInfo qnt;
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
output_scale = op_info->GetAttr<float>("output_scale");
weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
}
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
LOG(FATAL) << "[NNA] input node: " << input_name << ", could not be found";
}
// weight tensor
std::shared_ptr<Node> weight_node = nullptr;
bool per_channel = isScalesPerChannel(weight_scale);
uint8_t* weights_u8 = graph->GetBuilder()->GetBufromPool(w_dims.production());
if (enable_int8) {
qnt.type = IMGDNN_TYPE_Q_U8;
if (per_channel) {
LOG(FATAL)
<< "[NNA] FC per-channel quantization is not supported for Mirage";
} else {
qnt.scales.push_back(weight_scale.at(0));
qnt.zero_points.push_back(128);
}
const char* weight_src = static_cast<const char*>(weights->raw_data());
for (int i = 0; i < w_dims.production(); i++)
weights_u8[i] = static_cast<uint8_t>(weight_src[i] + 128);
} else {
LOG(FATAL) << "[NNA] PaddleLite Only 8-bits quantization.";
}
weight_node = graph->Add(
weight_name, weights_u8, w_dims.Vectorize(), qnt, Node::Role::kConst);
// Add bias node if bias tensor exists
imgdnn_tensor bias_tensor = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
std::shared_ptr<Node> bias_node = nullptr;
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindTensor(bias_name);
auto bias_dims = bias->dims();
CHECK_EQ(bias_dims.production(), n);
if (enable_int8 && bias->precision() == PRECISION(kFloat)) {
TensorInfoReset(&qnt);
qnt.type = IMGDNN_TYPE_I32;
if (per_channel) {
qnt.scales.resize(weight_scale.size());
qnt.count = bias_dims.size();
qnt.axis = 0;
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scales[i] = input_scale * weight_scale[i];
}
LOG(FATAL)
<< "[NNA] per-channel quantization is not supported for FC";
} else {
qnt.scales.push_back(weight_scale.at(0) * input_scale);
qnt.zero_points.push_back(0);
}
int quant_bits = 32;
auto dtype_max = static_cast<int>((1LL << (quant_bits - 1)) - 1);  // avoid signed overflow at 1 << 31
auto dtype_min = static_cast<int>(0 - dtype_max);
auto* bias_data = bias->data<float, float>();
int32_t* bias_qnt_data =
reinterpret_cast<int32_t*>(graph->GetBuilder()->GetBufromPool(
bias_dims.production() * sizeof(int32_t)));
for (int i = 0; i < n; i++) {
float current_scale = per_channel ? qnt.scales[i] : qnt.scales[0];
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / current_scale),
dtype_min),
dtype_max);
}
std::vector<int64_t> shapes{1};
bias_node = graph->Add(
bias_name, bias_qnt_data, shapes, qnt, Node::Role::kConst);
} else {
qnt.type = IMGDNN_TYPE_F32;
bias_node = graph->Add(bias_name, *bias, qnt, Node::Role::kConst);
}
}
bias_tensor = bias_node->data();
}
imgdnn_quant_param output_quant_param;
output_quant_param.scale = output_scale;
output_quant_param.zero_point = 128;
imgdnn_tensor fc_out_tensor = graph->GetBuilder()->createFullyConnectedLayer(
input_node->data(), weight_node->data(), bias_tensor, output_quant_param);
imgdnn_tensor_descriptor desc;
imgdnn_err_code err = imgdnnGetTensorDescriptor(fc_out_tensor, &desc);
CHECK(err == IMGDNN_SUCCESS) << "failed to get tensor descriptor (FC)";
graph->Add(out_name, fc_out_tensor, desc.type);
// reshape to out_dims
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc, kNNA, paddle::lite::subgraph::nna::FCConverter);
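The m/k/n computation in the FC bridge flattens the input shape around the in_num_col_dims attribute: every dimension before it multiplies into the row count m, every dimension from it onward into the column count k (which must match the weight's first dimension). A small sketch of the same reduction, illustrative only:
#include <cstddef>
#include <cstdint>
#include <vector>
static void FlattenToMK(const std::vector<int64_t>& dims,
                        int in_num_col_dims,
                        int64_t* m, int64_t* k) {
  *m = 1;
  *k = 1;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (static_cast<int>(i) < in_num_col_dims)
      *m *= dims[i];  // rows: leading dims
    else
      *k *= dims[i];  // cols: trailing dims, must equal the weight's row count
  }
}
// e.g. dims = {1, 1024, 1, 1}, in_num_col_dims = 1  =>  m = 1, k = 1024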
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/bridges/graph.h"
#include <utility>
#include "lite/kernels/nna/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
// Add 1
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Only intermediate node can be shared with the same name
if (!node->is_data() || !it->second.back()->is_data()) {
LOG(FATAL) << "[NNA] Const or Input node " << name << " is redefined.";
return -1;
}
} else {
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
it->second.push_back(node);
return it->second.size();
}
// Add 2
std::shared_ptr<Node> Graph::Add(const std::string& name,
const void* const const_data,
std::vector<int64_t> shape,
const TensorInfo& qnt,
Node::Role role /* = Node::Role::kData*/) {
auto node = std::make_shared<Node>(qnt.type, qnt.layout, role);
auto idx = Add(name, node);
CHECK_GE(idx, 1);
imgdnn_tensor_descriptor desc;
desc.type = qnt.type;
desc.dimensions = (unsigned)shape.size();
for (uint32_t i = 0; i < shape.size(); ++i) desc.size[i] = shape[i];
switch (qnt.type) {
case IMGDNN_TYPE_F32:
case IMGDNN_TYPE_I32:
break;
case IMGDNN_TYPE_Q_I8:
case IMGDNN_TYPE_Q_U8:
desc.quant_param.scale = qnt.scales[0];
desc.quant_param.zero_point = qnt.zero_points[0];
break;
case IMGDNN_TYPE_QPA_I8:
case IMGDNN_TYPE_QPA_U8:
desc.quant_param.per_axis = imgdnnCreatePerAxisQuantParam(
qnt.axis, qnt.count, qnt.scales.data(), qnt.zero_points.data());
CHECK(desc.quant_param.per_axis != nullptr);
break;
default:
LOG(FATAL) << "[NNA] invalid tensor type set in node: " << name;
return nullptr;
}
imgdnn_tensor out_tensor;
if (role == Node::Role::kConst) {
out_tensor = pImgdnnMgr->createFixedInputTensor(&desc, const_data, true);
} else {
LOG(INFO) << "[NNA] invald role set in this path: " << name;
}
if ((desc.type == IMGDNN_TYPE_QPA_I8 || desc.type == IMGDNN_TYPE_QPA_U8) &&
desc.quant_param.per_axis != nullptr)
imgdnnDestroyPerAxisQuantParam(desc.quant_param.per_axis);
node->set_data(out_tensor);
return node;
}
// Add 3
std::shared_ptr<Node> Graph::Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
const TensorInfo& qnt,
Node::Role role) {
auto node = std::make_shared<Node>(qnt.type, qnt.layout, role);
auto idx = Add(name, node);
CHECK_GE(idx, 1);
imgdnn_tensor_descriptor desc;
desc.type = qnt.type;
desc.dimensions = (unsigned)shape.size();
for (uint32_t i = 0; i < shape.size(); ++i) desc.size[i] = shape[i];
switch (qnt.type) {
case IMGDNN_TYPE_F32:
case IMGDNN_TYPE_I32:
break;
case IMGDNN_TYPE_Q_I8:
case IMGDNN_TYPE_Q_U8:
desc.quant_param.scale = qnt.scales[0];
desc.quant_param.zero_point = qnt.zero_points[0];
break;
case IMGDNN_TYPE_QPA_I8:
case IMGDNN_TYPE_QPA_U8:
desc.quant_param.per_axis = imgdnnCreatePerAxisQuantParam(
qnt.axis, qnt.count, qnt.scales.data(), qnt.zero_points.data());
CHECK(desc.quant_param.per_axis != nullptr);
break;
default:
LOG(FATAL) << "[NNA] invalid tensor type set in node: " << name;
return nullptr;
}
imgdnn_tensor out_tensor;
if (role == Node::Role::kInput) {
out_tensor = pImgdnnMgr->createInputTensor(&desc);
} else if (role == Node::Role::kConst) {
const void* const_data = tensor.raw_data();
out_tensor = pImgdnnMgr->createFixedInputTensor(&desc, const_data, false);
} else {
LOG(INFO) << "[NNA] invald role set in this path: " << name;
}
if ((desc.type == IMGDNN_TYPE_QPA_I8 || desc.type == IMGDNN_TYPE_QPA_U8) &&
desc.quant_param.per_axis != nullptr)
imgdnnDestroyPerAxisQuantParam(desc.quant_param.per_axis);
node->set_data(out_tensor);
return node;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "imgdnn.h" // NOLINT
#include "lite/backends/nna/imgdnn_manager.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "utility.h" // NOLINT
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
#define NNA_UNUSED(var) \
do { \
(void)(var); \
} while (0)
// Graph and Node are defined to collect all of the converted IMGDNN IR nodes
class Node {
public:
enum class Role {
kInput = 0,
kConst,
kData,
};
Node(imgdnn_tensor data, imgdnn_type type, DataLayoutType layout, Role role)
: data_(data), type_(type), layout_(layout), role_(role) {}
Node(imgdnn_type type, DataLayoutType layout, Role role)
: type_(type), layout_(layout), role_(role) {}
void set_data(imgdnn_tensor data) { data_ = data; }
void set_type(imgdnn_type type) { type_ = type; }
void set_layout(DataLayoutType layout) { layout_ = layout; }
void set_role(Role role) { role_ = role; }
template <typename T>
std::shared_ptr<T> data() {
return std::static_pointer_cast<T>(data_);
}
imgdnn_tensor data() { return data_; }
imgdnn_type type() const { return type_; }
DataLayoutType layout() const { return layout_; }
bool is_input() const { return role_ == Role::kInput; }
bool is_const() const { return role_ == Role::kConst; }
bool is_data() const { return role_ == Role::kData; }
private:
imgdnn_tensor data_{nullptr};
imgdnn_type type_{IMGDNN_TYPE_MAX};
DataLayoutType layout_{DATALAYOUT(kNCHW)};
Role role_{Role::kData};
};
class Graph {
public:
explicit Graph(lite::nna::ImgdnnManager* pMgr) : pImgdnnMgr(pMgr) {
  VLOG(5) << "[NNA] Graph created";
}
~Graph() { VLOG(5) << "[NNA] Graph destroyed"; }
// Add 1
int Add(const std::string& name, std::shared_ptr<Node> node);
// Add 2, weights,bias
std::shared_ptr<Node> Add(const std::string& name,
const void* const const_data,
std::vector<int64_t> shape,
const TensorInfo& qnt,
Node::Role role /* = Node::Role::kData*/);
// Add 3
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
const TensorInfo& qnt,
Node::Role role);
// Add 4
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
const TensorInfo& qnt,
Node::Role role) {
return Add(name, tensor, tensor.dims().Vectorize(), qnt, role);
}
// Used to add intermediate tensor
// Add 5
int Add(const std::string& name,
imgdnn_tensor tensor,
imgdnn_type type,
DataLayoutType layout = DATALAYOUT(kNCHW)) {
Node::Role role = Node::Role::kData;
auto node = std::make_shared<Node>(type, layout, role);
node->set_data(tensor);
return Add(name, node); // call Add 1
}
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[NNA] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
lite::nna::ImgdnnManager* GetBuilder() {
  CHECK(pImgdnnMgr != nullptr) << "[NNA] pImgdnnMgr used before initialization";
  return pImgdnnMgr;
}
private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
lite::nna::ImgdnnManager* pImgdnnMgr{nullptr};
};
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
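The converters above drive the Add/Get/Has variants roughly as in the commented usage sketch below; x_name, y_name, input_scale and out_tensor are placeholders rather than identifiers from the patch.
// Usage sketch (placeholders only):
//
//   TensorInfo qnt;
//   TensorInfoReset(&qnt);
//   qnt.type = IMGDNN_TYPE_Q_U8;
//   qnt.scales.push_back(input_scale);
//   qnt.zero_points.push_back(128);
//
//   // model input: builds an imgdnn input tensor via Add 3 / Add 4
//   std::shared_ptr<Node> x_node =
//       graph->Has(x_name) ? graph->Get(x_name)
//                          : graph->Add(x_name, *x, qnt, Node::Role::kInput);
//
//   // layer output: records an existing imgdnn tensor as intermediate (Add 5)
//   graph->Add(y_name, out_tensor, IMGDNN_TYPE_Q_U8);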
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kNNA);
USE_SUBGRAPH_BRIDGE(conv2d, kNNA);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNNA);
USE_SUBGRAPH_BRIDGE(fc, kNNA);
USE_SUBGRAPH_BRIDGE(pool2d, kNNA);
// USE_SUBGRAPH_BRIDGE(softmax, kNNA);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include "imgdnn.h" // NOLINT
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
// for quantization
float output_scale = 1.0;
if (op_info->HasAttr("enable_int8")) {
output_scale = op_info->GetAttr<float>("output_scale");
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
  x_node = graph->Get(x_name);
} else {
  LOG(WARNING) << "[NNA] Pooling input not found: " << x_name;
  return FAILED;
}
// pool mode
imgdnn_pooling_type img_pool_type;
if (pooling_type == "max") {
img_pool_type = IMGDNN_POOLING_MAX;
} else if (pooling_type == "avg") {
img_pool_type = IMGDNN_POOLING_AVERAGE;
} else {
LOG(WARNING) << "[NNA] Unsupported pooling type: " << pooling_type;
return FAILED;
}
// pad mode
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
// paddings and strides
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[NNA] Paddings size should be the same or twice as the inputs size.";
bool adaptive = false;
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto strides = op_info->GetAttr<std::vector<int>>("strides");
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
x->dims(),
strides,
ksize);
// ceil mode
/* bool ceil_mode =
op_info->HasAttr("ceil_mode") && op_info->GetAttr<bool>("ceil_mode");
*/
unsigned int img_ksize[2] = {(unsigned int)ksize[0], (unsigned int)ksize[1]};
unsigned int img_stride[2] = {(unsigned int)strides[0],
(unsigned int)strides[1]};
unsigned int pad_to_begin[2] = {(unsigned int)paddings[0],
(unsigned int)paddings[2]}; // top,left
unsigned int pad_to_end[2] = {(unsigned int)paddings[1],
(unsigned int)paddings[3]}; // bottom,right
if (global_pooling) {
img_ksize[0] = x_dims[2];
img_ksize[1] = x_dims[3];
}
imgdnn_quant_param output_quant_param;
output_quant_param.scale = output_scale;
output_quant_param.zero_point = 128;
imgdnn_tensor pooling_out =
graph->GetBuilder()->createPoolingLayer(x_node->data(),
output_quant_param,
img_ksize,
img_stride,
pad_to_begin,
pad_to_end,
img_pool_type);
// LOG(INFO) << "pooling op output:" << static_cast<int>(pooling_out);
imgdnn_tensor_descriptor desc;
imgdnn_err_code err = imgdnnGetTensorDescriptor(pooling_out, &desc);
CHECK(err == IMGDNN_SUCCESS) << "failed to get tensor descriptor (POOL)";
graph->Add(out_name, pooling_out, desc.type);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(pool2d,
kNNA,
paddle::lite::subgraph::nna::PoolConverter);
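As in the conv bridge, the insert-loop above expands a 2-element paddings attribute {pad_h, pad_w} into the 4-element {top, bottom, left, right} form by duplicating each entry. An equivalent standalone sketch, illustrative only:
#include <vector>
static std::vector<int> ExpandPaddings(std::vector<int> paddings) {
  if (paddings.size() == 2) {
    // {pad_h, pad_w} -> {pad_h, pad_h, pad_w, pad_w} = {top, bottom, left, right}
    paddings = {paddings[0], paddings[0], paddings[1], paddings[1]};
  }
  return paddings;  // already in the 4-element form otherwise
}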
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NNA] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = x_dims.size();
auto out_name = op_info->Output("Out").front();
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
if (axis < 0) {
axis += x_rank;
}
// for quantization
float output_scale = 1.0;
if (op_info->HasAttr("enable_int8")) {
output_scale = op_info->GetAttr<float>("output_scale");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
LOG(FATAL) << "[NNA] Softmax: Could not find the input tensor.";
}
imgdnn_quant_param output_quant_param;
output_quant_param.scale = output_scale;
output_quant_param.zero_point = 128;
imgdnn_tensor softmax_out_tensor = graph->GetBuilder()->createSoftmaxLayer(
x_node->data(), 1.0, axis, output_quant_param);
graph->Add(out_name, softmax_out_tensor, IMGDNN_TYPE_Q_U8);
} else {
LOG(FATAL) << "[NNA] Softmax: has no enable_int8 attribute.";
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(softmax,
kNNA,
paddle::lite::subgraph::nna::SoftmaxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/bridges/utility.h"
#include <utility>
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
bool isScalesPerChannel(std::vector<float> scales) {
  // Fewer than two scales always means per-tensor quantization and would
  // otherwise make "begin() + 1" invalid.
  if (scales.size() < 2) {
    return false;
  }
  for (auto iter = scales.begin() + 1; iter != scales.end(); ++iter) {
    if (*iter != scales.front()) {
      return true;
    }
  }
  return false;
}
void TensorInfoReset(TensorInfo* qnt) {
qnt->count = 0;
qnt->axis = 0;
qnt->scales.clear();
// qnt.scales.shrink_to_fit();
qnt->zero_points.clear();
// qnt.zero_points.shrink_to_fit();
qnt->layout = DATALAYOUT(kNCHW);
}
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
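For reference, the per-channel check above reports per-channel quantization only when the scale vector holds at least two distinct values; the expected behaviour is sketched below (these are illustrative calls, not tests from the patch).
// isScalesPerChannel({0.5f, 0.5f, 0.5f});  // false: one distinct scale (per-tensor)
// isScalesPerChannel({0.5f, 0.25f});       // true:  differing scales (per-channel)
// isScalesPerChannel({});                  // false: no scales recorded yet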
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "imgdnn.h" // NOLINT
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace nna {
struct TensorInfo {
imgdnn_type type;
std::vector<float> scales;
std::vector<int> zero_points;
DataLayoutType layout;
unsigned count;
unsigned axis;
};
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
bool isScalesPerChannel(std::vector<float> scales);
void TensorInfoReset(TensorInfo* qnt);
} // namespace nna
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/nna/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <limits>
#include <utility>
#include "lite/core/op_registry.h"
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/nna/bridges/paddle_use_bridges.h"
#include "lite/kernels/nna/bridges/utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace nna {
bool SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all of the ops with their input vars and weights, and add them
// into the NNA IMGDNN IR graph
subgraph::nna::Graph graph{&imgdnn_mgr_};
const auto& bridges = subgraph::Registry::Instance();
if (!origin_program_) {
BuildOriginProgram();
}
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kNNA))) {
// return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
status |=
bridges.Select(op_type, TARGET(kNNA))(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
// return subgraph::FAILED;
return false;
}
}
// Collect the valid input and output nodes in the IMGDNN IR graph and update
// the input and output names
device_inames_.clear();
std::vector<imgdnn_tensor> device_inodes;
for (auto& input_name : input_names_) {
if (graph.Has(input_name)) {
device_inodes.push_back(graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[NNA] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
device_onames_.clear();
std::vector<imgdnn_tensor> device_onodes;
for (auto& output_name : output_names_) {
if (graph.Has(output_name)) {
device_onodes.push_back(graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[NNA] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[NNA] No input nodes found for building NNA model";
CHECK(!device_onames_.empty())
<< "[NNA] No output nodes found for building NNA model";
imgdnn_mgr_.createNetworkObject(device_inodes.size(),
device_inodes.data(),
device_onodes.size(),
device_onodes.data());
// inputs
unsigned int num_inputs, num_outputs;
imgdnn_mgr_.getNetworkObjectInputs(
std::numeric_limits<unsigned int>::max(), nullptr, &num_inputs);
CHECK_EQ(num_inputs, device_inames_.size());
// origin_idims_.resize(num_inputs);
// origin_itensors_.resize(num_inputs);
device_itensors_.resize(num_inputs);
imgdnn_mgr_.getNetworkObjectInputs(
num_inputs, device_itensors_.data(), nullptr);
// show input info
for (int i = 0; i < num_inputs; i++) {
auto node = graph.Get(device_inames_[i]);
auto type = node->type();
auto layout = node->layout();
// origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
// CHECK(origin_itensors_[i]);
// origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[NNA] Inputs[" << i << "] name: " << device_inames_[i]
<< " type: " << type << " layout: " << DataLayoutToStr(layout);
}
// outputs
imgdnn_mgr_.getNetworkObjectOutputs(
std::numeric_limits<unsigned int>::max(), nullptr, &num_outputs);
CHECK_EQ(num_outputs, device_onames_.size());
// origin_odims_.resize(num_outputs);
// origin_otensors_.resize(num_outputs);
device_otensors_.resize(num_outputs);
imgdnn_mgr_.getNetworkObjectOutputs(
num_outputs, device_otensors_.data(), nullptr);
// show output info
for (int i = 0; i < num_outputs; i++) {
auto node = graph.Get(device_onames_[i]);
auto type = node->type();
auto layout = node->layout();
// origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
// CHECK(origin_otensors_[i]);
// origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[NNA] Outputs[" << i << "] name: " << device_onames_[i]
<< " type: " << type << " layout: " << DataLayoutToStr(layout);
// Prepare the device output tensors
switch (type) {
case IMGDNN_TYPE_F32:
origin_otensors_[i]->mutable_data<float>();
break;
case IMGDNN_TYPE_Q_I8:
case IMGDNN_TYPE_Q_U8:
origin_otensors_[i]->mutable_data<int8_t>();
break;
case IMGDNN_TYPE_I16:
origin_otensors_[i]->mutable_data<int16_t>();
break;
case IMGDNN_TYPE_I32:
origin_otensors_[i]->mutable_data<int32_t>();
break;
default:
LOG(FATAL) << "[NNA] " << device_onames_[i]
<< " can't mutable data with precision type " << type;
break;
}
}
return true;
}
bool SubgraphEngine::LaunchDeviceProgram() {
// Set input buffer
for (size_t i = 0; i < origin_itensors_.size(); i++) {
// check input shapes
imgdnn_tensor_descriptor in_desc =
imgdnn_mgr_.getInputDescriptor(device_itensors_[i]);
size_t in_size = imgdnn_mgr_.getDescriptorSize(&in_desc);
CHECK_EQ(in_size, origin_itensors_[i]->memory_size());
auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
converted_data[j] =
static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
}
imgdnn_memory in_mem = imgdnn_mgr_.importMemory(
static_cast<void*>(converted_data), origin_itensors_[i]->memory_size());
imgdnn_mgr_.addBindingInput(device_itensors_[i], in_mem);
}
// Set output buffer
std::vector<imgdnn_memory> out_mems;
for (size_t i = 0; i < origin_otensors_.size(); i++) {
// check output shapes
imgdnn_tensor_descriptor out_desc =
imgdnn_mgr_.getOutputDescriptor(device_otensors_[i]);
size_t out_size = imgdnn_mgr_.getDescriptorSize(&out_desc);
CHECK_EQ(out_size, origin_otensors_[i]->memory_size());
imgdnn_memory out_mem =
imgdnn_mgr_.allocateMemory(origin_otensors_[i]->memory_size());
imgdnn_mgr_.addBindingOutput(device_otensors_[i], out_mem);
out_mems.push_back(out_mem);
}
// Run the img model by name
imgdnn_mgr_.executeNetworkObject(true, 0, nullptr, nullptr);
// Copy the data of output tensor to the buffer of origin output tensors
for (size_t i = 0; i < out_mems.size(); i++) {
uint8_t* data = static_cast<uint8_t*>(
imgdnn_mgr_.lockMemory(out_mems[i], IMGDNN_LOCK_ACCESS_READ_ONLY));
int8_t* output_data = origin_otensors_[i]->mutable_data<int8_t>();
for (size_t j = 0; j < origin_otensors_[i]->data_size(); j++) {
output_data[j] = data[j] - 128;
}
imgdnn_mgr_.unlockMemory(out_mems[i]);
imgdnn_mgr_.destroyMemory(out_mems[i]);
}
return true;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names));
CHECK(engine_);
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Run();
}
} // namespace nna
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kNNA,
kInt8,
kNCHW,
paddle::lite::kernels::nna::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8))})
.Finalize();
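LaunchDeviceProgram above bridges Paddle's signed int8 tensors and the device's unsigned 8-bit buffers by shifting the zero point by 128 on the way in and back again on the way out. The helpers below are a standalone sketch of those two conversions (they are illustrative and not part of ImgdnnManager).
#include <cstddef>
#include <cstdint>
// int8 input data -> unsigned 8-bit device buffer (zero point moves from 0 to 128)
static void SignedToUnsigned(const int8_t* src, uint8_t* dst, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    dst[i] = static_cast<uint8_t>(static_cast<int16_t>(src[i]) + 128);
}
// unsigned 8-bit device buffer -> int8 output data (zero point moves back to 0)
static void UnsignedToSigned(const uint8_t* src, int8_t* dst, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    dst[i] = static_cast<int8_t>(static_cast<int16_t>(src[i]) - 128);
}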
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "imgdnn.h" // NOLINT
#include "lite/backends/nna/imgdnn_manager.h"
#include "lite/core/kernel.h"
#include "lite/kernels/nna/bridges/graph.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace nna {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
~SubgraphEngine() {}
protected:
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<imgdnn_input> device_itensors_;
std::vector<imgdnn_output> device_otensors_;
lite::nna::ImgdnnManager imgdnn_mgr_;
};
class SubgraphCompute
: public KernelLite<TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace nna
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kNNA"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
class TargetType:
kUnk = 0
kHost = 1
......@@ -74,6 +74,7 @@ class TargetType:
kRKNPU = 12
kAPU = 13
kHuaweiAscendNPU = 14
kNNA = 15
# record op_info of valid kernels into `valid_ops` according to different target type
......
#!/bin/bash
set -e
readonly VERSION="3.8"
readonly VERSION="6.0.0"
version=$(clang-format -version)
......