Commit 30145270 authored by: airockchip, committed by: GitHub

[RKNPU] Add Rockchip NPU backend (#3382)

Parent commit: bfe8b250
......@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
......@@ -129,6 +130,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11
endif()
if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -134,6 +134,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_RKNPU)
return()
endif()
if(NOT DEFINED RKNPU_DDK_ROOT)
set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
if(NOT RKNPU_DDK_ROOT)
message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(RKNPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(RKNPU_SUB_LIB_PATH "lib")
endif()
find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
if(NOT RKNPU_DDK_FILE)
message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
else()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
endif()
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
......@@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -142,6 +148,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
......@@ -177,7 +184,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -191,6 +198,7 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -226,7 +234,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -248,6 +256,7 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -280,6 +289,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -295,12 +305,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -323,6 +333,12 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
......@@ -379,6 +395,15 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS})
......@@ -427,6 +452,7 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
......@@ -481,6 +507,7 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
......
......@@ -7,6 +7,7 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
......@@ -76,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......
......@@ -34,9 +34,11 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels})
if(NOT APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
......@@ -59,6 +61,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif()
endif()
......@@ -69,6 +76,7 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
......@@ -82,6 +90,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
......@@ -90,6 +104,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
......@@ -107,6 +122,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
......@@ -128,6 +144,7 @@ lite_cc_library(light_api SRCS light_api.cc
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -147,6 +164,7 @@ if(WITH_TESTING)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -248,6 +266,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: ocr_test_ut is commented out because no OCR model is supplied for testing; it is kept as a reference for inferring NLP models
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
......@@ -291,6 +310,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -300,6 +320,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
......@@ -335,6 +356,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -356,6 +378,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -369,6 +392,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -382,6 +406,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -392,6 +417,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -404,17 +430,20 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -109,6 +109,10 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
......@@ -187,6 +191,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA",
"kNPU",
"kXPU",
"kRKNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -251,16 +256,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
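For context, a minimal C++ sketch of how an application could request the new RKNPU target through the Paddle-Lite CxxConfig API; the model directory and the ARM fallback place are assumptions for illustration, not part of this patch:

#include "paddle_api.h"  // Paddle-Lite full-API header

using namespace paddle::lite_api;

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  // Prefer the Rockchip NPU (int8) and fall back to ARM CPU kernels,
  // mirroring the places added in ParserValidPlaces() above.
  config.set_valid_places({
      Place{TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)},
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return predictor != nullptr ? 0 : 1;
}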
......@@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) {
"npu",
"xpu",
"bm",
"mlu"};
"mlu",
"rknpu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) {
"kNPU",
"kXPU",
"kMLU",
"kBM"};
"kBM",
"kRKNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......
......@@ -54,8 +54,9 @@ enum class TargetType : int {
kXPU = 9,
kBM = 10,
kMLU = 11,
kRKNPU = 12,
kAny = 6, // any target
NUM = 12, // number of fields.
NUM = 13, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -8,3 +8,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(rknpu)
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
std::unique_ptr<rk::nn::Exection> Device::Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
) {
VLOG(3) << "[RKNPU] Build model";
rk_graph->SetInputsOutputs(input_nodes, output_nodes);
std::unique_ptr<rk::nn::Exection> exector =
std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
exector->Build();
return exector;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
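A minimal sketch of how a subgraph engine might call Device::Build(); the graph and the input/output tensor vectors are assumed to have been populated by the RKNPU bridge converters, and the model name is a hypothetical identifier:

// Sketch only: not the engine code added by this commit.
std::unique_ptr<rk::nn::Exection> BuildSubgraph(
    rk::nn::Graph* rk_graph,
    const std::vector<std::shared_ptr<rk::nn::Tensor>>& input_nodes,
    const std::vector<std::shared_ptr<rk::nn::Tensor>>& output_nodes) {
  std::string model_name{"rknpu_subgraph_0"};
  // Device::Build() binds the graph inputs/outputs, compiles the graph and
  // returns an rk::nn::Exection that can be used to run inference.
  return paddle::lite::rknpu::Device::Global().Build(
      model_name, rk_graph, input_nodes, output_nodes);
}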
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
// Build the RK IR graph into a device model and return an executor
// that loads the model and runs inference.
std::unique_ptr<rk::nn::Exection> Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
); // NOLINT
private:
};
} // namespace rknpu
} // namespace lite
} // namespace paddle
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -59,6 +59,7 @@ using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -103,6 +104,21 @@ class Context<TargetType::kBM> {
};
#endif
#ifdef LITE_WITH_RKNPU
template <>
class Context<TargetType::kRKNPU> {
public:
Context() {}
explicit Context(const RKNPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(RKNPUContext* ctx) {}
RKNPUContext& operator=(const RKNPUContext& ctx) { return *this; }
std::string name() const { return "RKNPUContext"; }
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
......@@ -392,6 +408,12 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
&ctx->As<RKNPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU
case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
......@@ -461,6 +483,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
#ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>();
#endif
......
......@@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)});
......@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU)});
.BindTargets({TARGET(kNPU), TARGET(kRKNPU)});
......@@ -69,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void RKNPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
......@@ -93,5 +107,7 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
.BindTargets({TARGET(kRKNPU)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
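The RKNPUSubgraphPass above includes lite/kernels/rknpu/bridges/paddle_use_bridges.h to collect the supported op types. Based on the REGISTER_SUBGRAPH_BRIDGE entries added later in this commit, that header is expected to contain lines roughly like the following sketch (the actual file is not shown in this hunk):

USE_SUBGRAPH_BRIDGE(relu, kRKNPU);
USE_SUBGRAPH_BRIDGE(conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(fc, kRKNPU);
USE_SUBGRAPH_BRIDGE(batch_norm, kRKNPU);
USE_SUBGRAPH_BRIDGE(concat, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kRKNPU);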
......@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class RKNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
......@@ -110,6 +110,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kMLU): {
CREATE_KERNEL(kMLU);
} break;
case TARGET(kRKNPU): {
CREATE_KERNEL(kRKNPU);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
......@@ -232,6 +235,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kBM, kInt8, kNCHW);
INIT_FOR(kBM, kAny, kNCHW);
INIT_FOR(kBM, kAny, kAny);
INIT_FOR(kRKNPU, kFloat, kNCHW);
INIT_FOR(kRKNPU, kInt8, kNCHW);
INIT_FOR(kRKNPU, kAny, kNCHW);
INIT_FOR(kRKNPU, kAny, kAny);
#undef INIT_FOR
}
......
......@@ -251,6 +251,16 @@ class KernelRegistry final {
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -86,6 +86,7 @@ class Optimizer {
"npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device)
......
......@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -12,3 +12,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(rknpu)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU)
return()
endif()
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_rknpu RKNPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_rknpu ${rknpu_subgraph_bridges})
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_rknpu SRCS utility.cc DEPS ${rknpu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_rknpu SRCS graph.cc DEPS subgraph_bridge_utility_rknpu)
set(rknpu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_rknpu subgraph_bridge_graph_rknpu)
lite_cc_library(subgraph_bridge_conv_op_rknpu SRCS conv_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_rknpu SRCS act_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_rknpu SRCS softmax_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_rknpu SRCS pool_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_rknpu SRCS fc_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_rknpu SRCS batch_norm_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_rknpu SRCS concat_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_rknpu SRCS elementwise_ops.cc DEPS ${rknpu_subgraph_bridge_deps})
set(rknpu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_rknpu
subgraph_bridge_graph_rknpu
subgraph_bridge_conv_op_rknpu
subgraph_bridge_act_op_rknpu
subgraph_bridge_softmax_op_rknpu
subgraph_bridge_pool_op_rknpu
subgraph_bridge_fc_op_rknpu
subgraph_bridge_batch_norm_op_rknpu
subgraph_bridge_concat_op_rknpu
subgraph_bridge_elementwise_ops_rknpu
CACHE INTERNAL "rknpu_subgraph_bridges")
message(STATUS "+++++ rknpu_subgraph_bridges: ${rknpu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
// #include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->precision() == PRECISION(kFloat));
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
CHECK_EQ(op_type, "relu");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_var_name)) {
x_node = graph->Get(x_var_name);
} else {
x_node = graph->Add(x_var_name, *x, x_type->precision(), x_type->layout());
}
auto output_node = graph->Add(
output_var_name, *output, out_type->precision(), out_type->layout());
auto rGraph = graph->GetHandle();
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
auto relu =
rGraph->AddOperator(rk::nn::OperatorType::RELU, inputs, outputs, nullptr);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu,
kRKNPU,
paddle::lite::subgraph::rknpu::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto scale_name = op_info->Input("Scale").front();
auto scale_type = kernel->GetInputDeclType("Scale");
CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
auto scale = scope->FindMutableTensor(scale_name);
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
auto bias = scope->FindMutableTensor(bias_name);
auto mean_name = op_info->Input("Mean").front();
auto mean_type = kernel->GetInputDeclType("Mean");
CHECK(mean_type->layout() == DATALAYOUT(kNCHW));
auto mean = scope->FindMutableTensor(mean_name);
auto variance_name = op_info->Input("Variance").front();
auto variance_type = kernel->GetInputDeclType("Variance");
CHECK(variance_type->layout() == DATALAYOUT(kNCHW));
auto variance = scope->FindMutableTensor(variance_name);
auto y_name = op_info->Output("Y").front();
auto y_type = kernel->GetOutputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
float momentum = op_info->GetAttr<float>("momentum");
float epsilon = op_info->GetAttr<float>("epsilon");
int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
bool use_global_stats = op_info->GetAttr<bool>("use_global_stats");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Scale, Bias, Mean, Variance node
auto scale_node = graph->Add(scale_name, *scale);
auto bias_node = graph->Add(bias_name, *bias);
auto mean_node = graph->Add(mean_name, *mean);
auto variance_node = graph->Add(variance_name, *variance);
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
y->mutable_data<int8_t>();
}
output_node = graph->Add(y_name, *y, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
inputs.push_back(mean_node->data());
inputs.push_back(variance_node->data());
inputs.push_back(scale_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
rk::nn::BatchNormAttr attrs;
attrs.eps = epsilon;
auto rGraph = graph->GetHandle();
auto bn = rGraph->AddOperator(
rk::nn::OperatorType::BATCH_NORM, inputs, outputs, &attrs);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(batch_norm,
kRKNPU,
paddle::lite::subgraph::rknpu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " << op_type << " ... ";
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
auto num = x_names.size();
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Traverse all of the input nodes, which are added to the newly created
// concat node
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
int idx = 1;
for (auto& x_name : x_names) {
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.quant_bits = bit_length;
qnt.scale.push_back(input_scale);
x->mutable_data<int8_t>();
}
x_node =
graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
}
inputs.push_back(x_node->data());
idx++;
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
outputs.push_back(output_node->data());
rk::nn::ConcatAttr attrs;
attrs.axis = axis;
auto rGraph = graph->GetHandle();
auto concat = rGraph->AddOperator(
rk::nn::OperatorType::CONCAT, inputs, outputs, &attrs);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kRKNPU,
paddle::lite::subgraph::rknpu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <algorithm>
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " << op_type << "... ";
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
auto output_name = op_info->Output("Output").front();
auto output = scope->FindMutableTensor(output_name);
auto output_dims = output->dims();
auto bs = input_dims[0];
auto ic = input_dims[1];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4L);
CHECK_EQ(output_dims.size(), 4L);
CHECK_EQ(filter_dims.size(), 4L);
CHECK_EQ(output_dims[0], bs);
CHECK_EQ(output_dims[1], oc);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto fuse_relu = op_info->GetAttr<bool>("fuse_relu");
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
// Check depthwise mode
bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Input node
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.clear();
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
input_node =
graph->Add(input_name, *input, input->precision(), layout, qnt);
}
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[RKNPU] Paddings size should be the same as or twice the strides size.";
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
// Filter node
std::shared_ptr<Node> filter_node = nullptr;
QuantizationInfo filter_qnt;
filter_qnt.enable_int8 = enable_int8;
if (enable_int8) {
filter_qnt.scale = weight_scale;
filter_qnt.quant_bits = bit_length;
}
filter_node =
graph->Add(filter_name, *filter, filter->precision(), layout, filter_qnt);
// Add bias node if exists bias
// Supports the bias nodes with the following dimensions
// 0: {oc}
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
} else {
LOG(WARNING)
<< "[RKNPU] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
return FAILED;
}
if (enable_int8) {
auto bias_name_qnt = bias_name + "/qnt";
auto* bias_qnt = scope->NewTensor(bias_name_qnt);
bias_qnt->Resize(bias_shape);
bias_qnt->set_persistable(true);
bias_qnt->set_precision(PrecisionType::kInt32);
auto* bias_qnt_data = bias_qnt->mutable_data<int32_t>();
auto* bias_data = bias->mutable_data<float>();
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
auto dtype_max = static_cast<int>((1 << (qnt.quant_bits - 1)) - 1);
auto dtype_min = static_cast<int>(0 - dtype_max);
for (int i = 0; i < oc; i++) {
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / qnt.scale[i]),
dtype_min),
dtype_max);
}
bias_node = graph->Add(
bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt);
} else {
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
} else {
auto bias_name = filter_name + "/bias/dummy";
auto* bias = scope->NewTensor(bias_name);
std::vector<int64_t> bias_shape = {oc};
bias->Resize(bias_shape);
bias->set_persistable(true);
if (enable_int8) {
bias->set_precision(PrecisionType::kInt32);
auto* bias_data = bias->mutable_data<int32_t>();
for (int i = 0; i < oc; i++) {
bias_data[i] = 0;
}
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt);
} else {
bias->set_precision(PrecisionType::kFloat);
auto* bias_data = bias->mutable_data<float>();
for (int i = 0; i < oc; i++) {
bias_data[i] = 0.0;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
// Conv node
std::shared_ptr<Node> conv_node = nullptr;
std::shared_ptr<Node> output_node = nullptr;
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(output_name, *output, precision, layout, output_qnt);
inputs.push_back(input_node->data());
inputs.push_back(filter_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
rk::nn::Conv2DAttr attr;
attr.ksize[0] = filter_dims[2];
attr.ksize[1] = filter_dims[3];
attr.stride[0] = strides[0];
attr.stride[1] = strides[1];
attr.pad[0] = paddings[0];
attr.pad[1] = paddings[1];
attr.pad[2] = paddings[2];
attr.pad[3] = paddings[3];
attr.group = groups;
attr.weights = oc;
attr.dilation[0] = dilations[0];
attr.dilation[1] = dilations[1];
attr.pad_type = rk::nn::PadType::AUTO;
attr.has_relu = fuse_relu;
if (is_depthwise_mode) {
attr.multiplier = 1;
} else {
attr.multiplier = 0;
}
auto rGraph = graph->GetHandle();
auto conv = rGraph->AddOperator(
rk::nn::OperatorType::CONV2D, inputs, outputs, &attr, output_name);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kRKNPU,
paddle::lite::subgraph::rknpu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kRKNPU,
paddle::lite::subgraph::rknpu::ConvConverter);
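As a side note on the paddings handling in ConvConverter above: when the op carries only two padding values, each one is duplicated per spatial dimension before UpdatePaddingAndDilation() is called. A small self-contained sketch of that expansion:

#include <vector>

int main() {
  // {pad_h, pad_w} -> {pad_h, pad_h, pad_w, pad_w}, as done in ConvConverter.
  std::vector<int> paddings{1, 2};
  for (size_t i = 0; i < 2; ++i) {
    int copy_pad = *(paddings.begin() + 2 * i);
    paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
  }
  // paddings now holds {1, 1, 2, 2}.
  return paddings.size() == 4 ? 0 : 1;
}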
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
std::vector<int64_t> CvtYShape(const DDim& x_dims,
const DDim& y_dims,
int axis) {
CHECK_EQ(x_dims.size(), 4UL) << "[RKNPU] Only support 4-dimension x";
CHECK_GE(x_dims.size(), y_dims.size());
if (axis < 0) {
axis += x_dims.size();
}
std::vector<int64_t> y_new_shape(y_dims.Vectorize());
if (y_new_shape.size() == 4UL) {
return y_new_shape;
}
for (int i = 0; i < axis; i++) {
y_new_shape.insert(y_new_shape.begin(), 1);
}
while (y_new_shape.size() < 4) {
y_new_shape.push_back(1);
}
CHECK_EQ(y_new_shape.size(), 4UL);
return y_new_shape;
}
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.clear();
qnt.scale.push_back(input_scale);
qnt.quant_bits = op_info->GetAttr<int>("bit_length");
}
x_node = graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
}
// Y node
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
} else {
// auto y_new_shape = CvtYShape(x_dims, y_dims, axis);
// y_node = graph->Add(y_name, *y, y_new_shape);
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.quant_bits = bit_length;
qnt.scale.clear();
qnt.scale.push_back(input_scale);
}
y_node = graph->Add(y_name, *y, y_type->precision(), y_type->layout(), qnt);
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.clear();
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(
out_name, *output, x_type->precision(), x_type->layout(), output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
inputs.push_back(y_node->data());
outputs.push_back(output_node->data());
auto rGraph = graph->GetHandle();
// Elementwise node
if (op_type == "elementwise_add") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::ADD, inputs, outputs, nullptr);
} else if (op_type == "elementwise_sub") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::SUBTRACT, inputs, outputs, nullptr);
} else if (op_type == "elementwise_mul") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::MULTIPLY, inputs, outputs, nullptr);
} else if (op_type == "elementwise_div") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::DIVIDE, inputs, outputs, nullptr);
} else {
LOG(WARNING) << "[RKNPU] Unsupported op type: " << op_type;
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_div,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
auto input_name = op_info->Input("Input").front();
auto input_type = kernel->GetInputDeclType("Input");
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
auto w = scope->FindMutableTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[RKNPU] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
input_node = graph->Add(input_name, *input);
}
// Create w const node, set its shape to (n, k) and fill with
// the transposed w tensor
auto* transpose_w = scope->NewTensor(w_name + "/transpose");
std::shared_ptr<Node> trans_w_node = nullptr;
transpose_w->Resize({n, k});
transpose_w->set_persistable(true);
if (enable_int8) {
QuantizationInfo filter_qnt;
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
filter_qnt.enable_int8 = enable_int8;
filter_qnt.scale = weight_scale;
filter_qnt.quant_bits = bit_length;
auto transpose_w_data = transpose_w->mutable_data<int8_t>();
auto w_data = w->mutable_data<int8_t>();
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node = graph->Add(
w_name, *transpose_w, precision, w_type->layout(), filter_qnt);
} else {
auto transpose_w_data = transpose_w->mutable_data<float>();
auto w_data = w->mutable_data<float>();
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node =
graph->Add(w_name, *transpose_w, precision, w_type->layout());
}
// Add bias node if bias tensor exists
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
std::vector<int64_t> bias_shape = {n};
VLOG(3) << "[RKNPU] bias precision: "
<< PrecisionToStr(bias->precision());
// We need to quantize bias
if (enable_int8) {
auto bias_name_qnt = bias_name + "/qnt";
auto* bias_qnt = scope->NewTensor(bias_name_qnt);
auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
bias_qnt->Resize(bias_shape);
bias_qnt->set_persistable(true);
bias_qnt->set_precision(PrecisionType::kInt32);
auto* bias_qnt_data = bias_qnt->mutable_data<int32_t>();
auto* bias_data = bias->mutable_data<float>();
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
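        // Quantize the float bias to int32 with scale = input_scale *
        // weight_scale, clamping to the symmetric int32 range.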
auto dtype_max = static_cast<int>((1 << (qnt.quant_bits - 1)) - 1);
auto dtype_min = static_cast<int>(0 - dtype_max);
for (int i = 0; i < n; i++) {
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / qnt.scale[i]),
dtype_min),
dtype_max);
}
bias_node = graph->Add(
bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt);
} else {
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
} else {
auto bias_name = w_name + "/bias/dummy";
auto* bias = scope->NewTensor(bias_name);
std::vector<int64_t> bias_shape = {n};
bias->Resize(bias_shape);
bias->set_persistable(true);
if (enable_int8) {
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
bias->set_precision(PrecisionType::kInt32);
auto* bias_data = bias->mutable_data<int32_t>();
for (int i = 0; i < n; i++) {
bias_data[i] = 0;
}
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt);
} else {
bias->set_precision(PrecisionType::kFloat);
auto* bias_data = bias->mutable_data<float>();
for (int i = 0; i < n; i++) {
bias_data[i] = 0.0;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.clear();
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(input_node->data());
inputs.push_back(trans_w_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
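  // FCAttr::weights is the output feature count (n); no fused activation.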
rk::nn::FCAttr attrs;
attrs.weights = n;
attrs.has_relu = false;
auto rGraph = graph->GetHandle();
auto fc = rGraph->AddOperator(
rk::nn::OperatorType::FULLCONNECT, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc,
kRKNPU,
paddle::lite::subgraph::rknpu::FCConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/bridges/graph.h"
#include <rknpu/graph.h>
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[RKNPU] Const or data node " << name << " is redefined.";
return -1;
}
} else {
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
it->second.push_back(node);
return it->second.size();
}
// Const or data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout,
const QuantizationInfo& qnt) {
std::shared_ptr<Node> node = nullptr;
if (precision == PrecisionType::kUnk) {
precision = tensor.precision(); // todo
}
if (precision == PrecisionType::kUnk) {
if (qnt.enable_int8 && qnt.quant_bits == 8) {
precision = PrecisionType::kInt8;
} else if (!qnt.enable_int8) {
precision = PrecisionType::kFloat;
} else {
LOG(ERROR) << "[rknpu]:Graph:: tensor precision unknown!";
}
}
if (precision != tensor.precision()) {
LOG(INFO) << "[rknpu]:Graph::Add: tensor precision mismatch!" << name << ":"
<< PrecisionToStr(precision) << " vs "
<< PrecisionToStr(tensor.precision());
}
if (tensor.persistable()) {
// Const node
node = std::make_shared<Node>(precision, layout, Node::Role::kConst);
auto idx = Add(name, node);
CHECK_EQ(idx, 1);
auto attr = std::make_shared<rk::nn::TensorAttr>();
attr->precision = ToRknpuPrecisionType(precision);
attr->layout = ToRknpuDataLayoutType(layout);
attr->role = rk::nn::TensorRole::CONST;
attr->name = name;
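    // Attach symmetric quantization parameters for int8 weights and int32
    // bias tensors; float tensors carry no quantization attributes.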
switch (precision) {
case PrecisionType::kInt8:
attr->qntBits = 8;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
case PrecisionType::kInt32:
attr->qntBits = 32;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
default:
break;
}
attr->dims.resize(shape.size());
for (int i = 0; i < shape.size(); i++) {
attr->dims[i] = shape[i];
}
LOG(INFO) << "[rknpu]:Graph::Add const node:" << name
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
node->set_data(
rgraph_->CreateTensor(attr, const_cast<void*>(tensor.raw_data())));
} else {
// Data node
node = Add(name, shape, precision, layout, qnt);
}
return node;
}
// Data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout,
const QuantizationInfo& qnt) {
auto node = std::make_shared<Node>(precision, layout, Node::Role::kData);
auto idx = Add(name, node);
CHECK_EQ(idx, 1);
auto attr = std::make_shared<rk::nn::TensorAttr>();
attr->precision = ToRknpuPrecisionType(precision);
attr->layout = ToRknpuDataLayoutType(layout);
attr->role = rk::nn::TensorRole::VAR;
attr->name = name;
switch (precision) {
case PrecisionType::kInt8:
attr->qntBits = 8;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
case PrecisionType::kInt32:
attr->qntBits = 32;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
default:
break;
}
attr->dims.resize(shape.size());
for (int i = 0; i < shape.size(); i++) {
attr->dims[i] = shape[i];
}
LOG(INFO) << "[rknpu]:Graph::Add data node:" << name
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
node->set_data(rgraph_->CreateTensor(attr, nullptr)); // todo
return node;
}
Graph::Graph() {
rgraph_ = new rk::nn::Graph();
CHECK(rgraph_ != nullptr);
}
Graph::~Graph() {}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
// Graph and Node are defined to collect all of the converted RKNPU IR nodes
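// Quantization info attached to a node: enable_int8 marks whether the tensor
// is quantized, quant_bits is the bit width (8 for activations/weights, 32
// for bias), and scale holds the per-tensor or per-channel scales.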
struct QuantizationInfo {
int enable_int8;
int quant_bits;
std::vector<float> scale;
};
class Node {
public:
enum class Role {
kVar = 0,
kConst,
kData,
};
Node(std::shared_ptr<rk::nn::Tensor> data,
PrecisionType precision,
DataLayoutType layout,
Role role)
: data_(data), precision_(precision), layout_(layout), role_(role) {}
Node(PrecisionType precision, DataLayoutType layout, Role role)
: precision_(precision), layout_(layout), role_(role) {}
void set_data(std::shared_ptr<rk::nn::Tensor> data) { data_ = data; }
void set_precision(PrecisionType precision) { precision_ = precision; }
void set_layout(DataLayoutType layout) { layout_ = layout; }
void set_role(Role role) { role_ = role; }
void set_quant_param(const QuantizationInfo& qnt) { qnt_ = qnt; }
std::shared_ptr<rk::nn::Tensor> data() { return data_; }
PrecisionType precision() const { return precision_; }
DataLayoutType layout() const { return layout_; }
Role role() const { return role_; }
bool is_var() const { return role_ == Role::kVar; }
bool is_const() const { return role_ == Role::kConst; }
bool is_data() const { return role_ == Role::kData; }
private:
std::shared_ptr<rk::nn::Tensor> data_{nullptr};
PrecisionType precision_{PRECISION(kFloat)};
DataLayoutType layout_{DATALAYOUT(kNCHW)};
Role role_{Role::kVar};
QuantizationInfo qnt_;
};
class Graph {
public:
Graph();
~Graph();
public:
int Add(const std::string& name, std::shared_ptr<Node> node);
// Const or data node
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType precision = PRECISION(kUnk),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo());
std::shared_ptr<Node> Get(const std::string& name) {
CHECK(Has(name)) << "[RKNPU] Node " << name << " not found.";
return nodes_.at(name).back();
}
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
PrecisionType precision = PRECISION(kUnk),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo()) {
return Add(name, tensor, tensor.dims().Vectorize(), precision, layout, qnt);
}
// Data node
std::shared_ptr<Node> Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo());
std::shared_ptr<Node> Add(const std::string& name,
DDim dims,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo()) {
return Add(name, dims.Vectorize(), precision, layout, qnt);
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
rk::nn::Graph* GetHandle() { return rgraph_; }
private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
rk::nn::Graph* rgraph_;
};
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kRKNPU);
USE_SUBGRAPH_BRIDGE(conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(pool2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(fc, kRKNPU);
USE_SUBGRAPH_BRIDGE(softmax, kRKNPU);
USE_SUBGRAPH_BRIDGE(batch_norm, kRKNPU);
USE_SUBGRAPH_BRIDGE(concat, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kRKNPU);
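For reference, every bridge registered above follows the same pattern as the fc, pool2d and softmax converters in this commit: look up the op's input/output tensors in the scope, add (or reuse) the matching nodes in subgraph::rknpu::Graph, and append a single RKNPU operator. The sketch below illustrates that pattern for a plain activation op. It is illustrative only, not the converter shipped in this commit: rk::nn::OperatorType::RELU, the null attribute pointer passed to AddOperator, and the omission of the int8 quantization handling are simplifying assumptions.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
// Minimal, float-only converter sketch; real converters also propagate
// quantization info (see the fc and pool2d bridges in this commit).
int ReluConverterSketch(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto scope = op->scope();
  // Input node: reuse it if an earlier converter already added it.
  auto x_name = op_info->Input("X").front();
  auto x = scope->FindMutableTensor(x_name);
  std::shared_ptr<Node> x_node =
      graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);
  // Output node.
  auto out_name = op_info->Output("Out").front();
  auto output = scope->FindMutableTensor(out_name);
  auto output_node = graph->Add(out_name, *output);
  // Append one RKNPU operator; RELU and the null attribute pointer are
  // assumptions about the rknpu_ddk API.
  std::vector<std::shared_ptr<rk::nn::Tensor>> inputs{x_node->data()};
  std::vector<std::shared_ptr<rk::nn::Tensor>> outputs{output_node->data()};
  graph->GetHandle()->AddOperator(
      rk::nn::OperatorType::RELU, inputs, outputs, nullptr);
  return REBUILD_WHEN_SHAPE_CHANGED;
}
}  // namespace rknpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
A real converter is then registered with REGISTER_SUBGRAPH_BRIDGE(relu, kRKNPU, ...) and declared in paddle_use_bridges.h, as done above.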
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto output = scope->FindMutableTensor(out_name);
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (x->precision() == PRECISION(kInt8)) {
// enable_int8 = op_info->GetAttr<bool>("enable_int8");
enable_int8 = true;
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
LOG(WARNING) << "[RKNPU] Pooling int8";
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
x_node = graph->Add(x_name, *x, x->precision(), layout, qnt);
}
// pool mode
rk::nn::PoolType mode = rk::nn::PoolType::POOLING_UNKNOWN;
if (pooling_type == "max") {
mode = rk::nn::PoolType::POOLING_MAX;
} else if (pooling_type == "avg") {
mode = rk::nn::PoolType::POOLING_AVG;
} else {
LOG(WARNING) << "[RKNPU] Unsupported pooling type: " << pooling_type;
return FAILED;
}
// pad mode
rk::nn::PadType pad_mode = rk::nn::PadType::AUTO;
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
if (padding_algorithm == "SAME") {
pad_mode = rk::nn::PadType::SAME;
} else if (padding_algorithm == "VALID") {
pad_mode = rk::nn::PadType::VALID;
}
// paddings and strides
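  // If only two padding values (height, width) are given, duplicate each one
  // so that paddings becomes {top, bottom, left, right}.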
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
  CHECK_EQ(paddings.size(), 4L)
      << "[RKNPU] Paddings size should be the same as or twice the number of "
         "spatial dimensions.";
bool adaptive = false;
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto strides = op_info->GetAttr<std::vector<int>>("strides");
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
x->dims(),
strides,
ksize);
// ceil mode
int ceil_mode = 0;
if (op_info->HasAttr("ceil_mode")) {
ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
rk::nn::PoolAttr attrs;
attrs.ksize[0] = ksize[0];
attrs.ksize[1] = ksize[1];
attrs.stride[0] = strides[0];
attrs.stride[1] = strides[1];
attrs.pad[0] = paddings[0];
attrs.pad[1] = paddings[1];
attrs.pad[2] = paddings[2];
attrs.pad[3] = paddings[3];
attrs.pad_type = pad_mode;
attrs.pool_type = mode;
attrs.global_pooling = global_pooling;
if (ceil_mode) {
attrs.round_type = rk::nn::RoundType::ROUND_CEIL;
} else {
attrs.round_type = rk::nn::RoundType::ROUND_FLOOR;
}
auto rGraph = graph->GetHandle();
auto pool =
rGraph->AddOperator(rk::nn::OperatorType::POOL, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(pool2d,
kRKNPU,
paddle::lite::subgraph::rknpu::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = x_dims.size();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_rank;
}
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
x_node = graph->Add(x_name, *x, precision, layout, qnt);
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
rk::nn::SoftmaxAttr attrs;
attrs.axis = axis;
attrs.beta = 1.0;
auto rGraph = graph->GetHandle();
auto softmax = rGraph->AddOperator(
rk::nn::OperatorType::SOFTMAX, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(softmax,
kRKNPU,
paddle::lite::subgraph::rknpu::SoftmaxConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/bridges/utility.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision) {
rk::nn::PrecisionType t = rk::nn::PrecisionType::UNKNOWN;
switch (precision) {
case PrecisionType::kFloat:
t = rk::nn::PrecisionType::FLOAT32;
break;
case PrecisionType::kFP16:
t = rk::nn::PrecisionType::FLOAT16;
break;
case PrecisionType::kInt16:
t = rk::nn::PrecisionType::INT16;
break;
case PrecisionType::kInt32:
t = rk::nn::PrecisionType::INT32;
break;
case PrecisionType::kInt64:
t = rk::nn::PrecisionType::INT64;
break;
case PrecisionType::kInt8:
t = rk::nn::PrecisionType::INT8;
break;
case PrecisionType::kBool:
t = rk::nn::PrecisionType::BOOL8;
break;
default:
break;
}
return t;
}
rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout) {
rk::nn::DataLayoutType t = rk::nn::DataLayoutType::UNKNOWN;
switch (layout) {
case DataLayoutType::kNCHW:
t = rk::nn::DataLayoutType::NCHW;
break;
case DataLayoutType::kNHWC:
t = rk::nn::DataLayoutType::NHWC;
break;
default:
break;
}
return t;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision);
rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout);
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/rknpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#include "lite/kernels/rknpu/bridges/utility.h"
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace kernels {
namespace rknpu {
int SubgraphEngine::BuildDeviceProgram() {
LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
int status = 0;
  // Convert all of the ops, along with their input vars and weights, and add
  // them into the RKNPU IR graph
subgraph::rknpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kRKNPU))) {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kRKNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Collect the valid input and output nodes in the RKNPU IR graph and update
// the input and output names
device_inames_.clear();
device_onames_.clear();
for (auto& input_name : input_names_) {
LOG(INFO) << "[RKNPU] Input node " << input_name;
if (graph.Has(input_name)) {
LOG(INFO) << input_name << " Precision "
<< PrecisionToStr(graph.Get(input_name)->precision());
device_itensors_.push_back(graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[RKNPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
for (auto& output_name : output_names_) {
LOG(INFO) << "[RKNPU] Output node " << output_name;
if (graph.Has(output_name)) {
auto tensor = scope_->FindMutableTensor(output_name);
LOG(INFO) << output_name << " Precision "
<< PrecisionToStr(tensor->precision());
device_otensors_.push_back(graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[RKNPU] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[RKNPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[RKNPU] No output nodes found for building NPU model";
device_program_ = lite::rknpu::Device::Global().Build(
model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
if (device_program_ == nullptr) {
LOG(WARNING) << "[RKNPU] Build model failed!";
return subgraph::FAILED;
}
// input
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
}
// output
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
auto output_dims = origin_otensors_[i]->dims();
}
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
device_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
}
for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
// Prepare the device output tensors
switch (precision) {
case PRECISION(kFloat):
origin_otensors_[i]->mutable_data<float>();
break;
case PRECISION(kInt8):
origin_otensors_[i]->mutable_data<int8_t>();
break;
case PRECISION(kInt16):
origin_otensors_[i]->mutable_data<int16_t>();
break;
case PRECISION(kInt32):
origin_otensors_[i]->mutable_data<int32_t>();
break;
case PRECISION(kInt64):
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[RKNPU] " << device_onames_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
LOG(INFO) << "[RKNPU]:LaunchDeviceProgram";
std::vector<rk::nn::InputInfo> inputs;
std::vector<rk::nn::OutputInfo> outputs;
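  // Wrap the origin (CPU) tensors as RKNPU input/output descriptors; the DDK
  // reads input data from and writes results into these buffers directly.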
inputs.resize(device_itensors_.size());
for (size_t i = 0; i < device_itensors_.size(); i++) {
inputs[i].index = i;
inputs[i].buf = const_cast<void*>(origin_itensors_[i]->raw_data());
inputs[i].size = origin_itensors_[i]->memory_size();
inputs[i].pass_through = false;
inputs[i].type =
subgraph::rknpu::ToRknpuPrecisionType(origin_itensors_[i]->precision());
inputs[i].layout = rk::nn::DataLayoutType::NCHW;
}
outputs.resize(device_otensors_.size());
for (size_t i = 0; i < device_otensors_.size(); i++) {
outputs[i].index = i;
outputs[i].buf = const_cast<void*>(origin_otensors_[i]->raw_data());
outputs[i].size = origin_otensors_[i]->memory_size();
outputs[i].want_float = false;
}
device_program_->SetInputs(inputs);
device_program_->Run();
device_program_->GetOutputs(outputs);
return 0;
}
void SubgraphCompute::PrepareForRun() {
LOG(INFO) << "[RKNPU]:PrepareForRun";
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
LOG(INFO) << "[RKNPU]:Run";
CHECK(engine_);
engine_->Launch();
}
} // namespace rknpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kRKNPU,
kInt8,
kNCHW,
paddle::lite::kernels::rknpu::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/program.h"
#include "lite/core/types.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace rknpu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
std::string model_name_;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_;
std::unique_ptr<rk::nn::Exection> device_program_{nullptr};
};
class SubgraphCompute
: public KernelLite<TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace rknpu
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -12,3 +12,10 @@ if(LITE_WITH_XPU)
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
if(LITE_WITH_RKNPU)
lite_cc_test(test_mobilenetv1_int8_rknpu SRCS test_mobilenetv1_int8_rknpu.cc
DEPS ${lite_model_test_DEPS} paddle_api_full
RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
inline int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0]
<< " model_dir [thread_num] [warmup_times] [repeat_times] "
"[input_data_path] [output_data_path]"
<< std::endl;
return -1;
}
std::string model_dir = argv[1];
int thread_num = 1;
if (argc > 2) {
thread_num = atoi(argv[2]);
}
int warmup_times = 5;
if (argc > 3) {
warmup_times = atoi(argv[3]);
}
int repeat_times = 10;
if (argc > 4) {
repeat_times = atoi(argv[4]);
}
std::string input_data_path;
if (argc > 5) {
input_data_path = argv[5];
}
std::string output_data_path;
if (argc > 6) {
output_data_path = argv[6];
}
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
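  // List both ARM and RKNPU places so that quantized ops with an RKNPU bridge
  // run on the NPU subgraph while unsupported ops fall back to ARM kernels.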
  config.set_valid_places(
      {paddle::lite_api::Place{
           TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)},
       paddle::lite_api::Place{
           TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)},
       paddle::lite_api::Place{
           TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto input_data = input_tensor->mutable_data<float>();
auto input_size = ShapeProduction(input_tensor->shape());
if (input_data_path.empty()) {
for (int i = 0; i < input_size; i++) {
input_data[i] = 1;
}
} else {
std::fstream fs(input_data_path, std::ios::in);
if (!fs.is_open()) {
std::cerr << "open input data file failed." << std::endl;
return -1;
}
for (int i = 0; i < input_size; i++) {
fs >> input_data[i];
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < repeat_times; ++i) {
predictor->Run();
}
std::cout << "Model: " << model_dir << ", threads num " << thread_num
<< ", warmup times: " << warmup_times
<< ", repeat times: " << repeat_times << ", spend "
<< (GetCurrentUS() - start) / repeat_times / 1000.0
<< " ms in average." << std::endl;
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
auto output_data = output_tensor->data<float>();
auto output_size = ShapeProduction(output_tensor->shape());
std::cout << "output data:";
for (int i = 0; i < output_size; i += 100) {
std::cout << "[" << i << "] " << output_data[i] << std::endl;
}
return 0;
}
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -27,6 +27,8 @@ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.hua
BUILD_XPU=OFF
BUILD_XTCL=OFF
XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
BUILD_RKNPU=OFF
RKNPU_DDK_ROOT="$(pwd)/rknpu/"
LITE_WITH_ARM_LANG=OFF
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
......@@ -141,6 +143,8 @@ function make_tiny_publish_so {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j$NUM_PROC
......@@ -230,6 +234,8 @@ function make_full_publish_so {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DLITE_WITH_TRAIN=$BUILD_TRAIN \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
......@@ -265,6 +271,8 @@ function make_all_tests {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make lite_compile_deps -j$NUM_PROC
......@@ -498,6 +506,14 @@ function main {
XPU_SDK_ROOT="${i#*=}"
shift
;;
--build_rknpu=*)
BUILD_RKNPU="${i#*=}"
shift
;;
--rknpu_ddk_root=*)
RKNPU_DDK_ROOT="${i#*=}"
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
......
#!/bin/bash
set -ex
# global variables with default value
ARM_OS="armlinux" # android only yet
ARM_ABI="armv8" # armv8, armv7
ARM_LANG="gcc" # gcc only yet
DDK_ROOT="$(pwd)/rknpu"
TARGET_NAME="test_subgraph_pass" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
SHUTDOWN_LOG=OFF # ON(disable logging)/OFF
ON_TINY_PUBLISH=OFF # ON(tiny publish)/OFF(full publish)
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--arm_os=<os> android only yet."
echo -e "--arm_abi=<abi> armv8, armv7 yet."
echo -e "--arm_lang=<gcc>"
echo -e "--ddk_root=<hiai_ddk_root>"
echo -e "--target_name=<target_name>"
echo "----------------------------------------"
echo
}
# For code gen, a source file is generated after a test,
# but it is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2.Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
}
function prepare_thirdparty {
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$PWD
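    # Re-create third-party from the pre-packed tarball when the directory is
    # missing (or a tarball is already present); otherwise update submodules.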
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
function build_npu {
cur_dir=$(pwd)
prepare_thirdparty
local publish_dir
if [[ "${ON_TINY_PUBLISH}" == "ON" ]]; then
WITH_TESTING=OFF
SHUTDOWN_LOG=ON
publish_dir="tiny_publish"
else
publish_dir="full_publish"
fi
build_dir=$cur_dir/build.lite.rknpu.${ARM_OS}.${ARM_ABI}.${ARM_LANG}.${publish_dir}
mkdir -p $build_dir
cd $build_dir
    # Prepare the fake gen_code file and debug tool required by cmake
prepare_workspace
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_NPU=OFF \
-DLITE_WITH_JAVA=OFF \
-DLITE_WITH_ARM=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_ARM_DOTPROD=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DLITE_SHUTDOWN_LOG=${SHUTDOWN_LOG} \
-DLITE_ON_TINY_PUBLISH=${ON_TINY_PUBLISH} \
-DARM_TARGET_OS=${ARM_OS} \
-DARM_TARGET_ARCH_ABI=${ARM_ABI} \
-DARM_TARGET_LANG=${ARM_LANG} \
-DLITE_WITH_RKNPU=ON \
-DRKNPU_DDK_ROOT=${DDK_ROOT}
make $TARGET_NAME -j2
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--arm_os=*)
ARM_OS="${i#*=}"
shift
;;
--arm_abi=*)
ARM_ABI="${i#*=}"
shift
;;
--arm_lang=*)
ARM_LANG="${i#*=}"
shift
;;
--android_stl=*)
ANDROID_STL="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--ddk_root=*)
DDK_ROOT="${i#*=}"
shift
;;
build)
build_npu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_npu
shift
;;
tiny_publish)
ON_TINY_PUBLISH=ON
TARGET_NAME=publish_inference
build_npu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
......@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"]
valid_ops = [[], [], [], [], [], [], [], [], [], []]
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
class TargetType:
kUnk = 0
kHost = 1
......@@ -68,6 +68,9 @@ class TargetType:
kFPGA = 7
kNPU = 8
kXPU = 9
kBM = 10
kMLU = 11
kRKNPU = 12
kAny = 6 # any target
# record op_info of valid kernels into `valid_ops` according to different target type
......