Commit 8278f114 authored by yanghongtian

add subgraph compute and backends

Parent 841061b1
@@ -153,12 +153,12 @@ function(lite_cc_library TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
# MLU_DEPS ${args_MLU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} SHARED)
@@ -204,7 +204,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
@@ -443,8 +443,8 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
@@ -463,7 +463,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -498,8 +498,8 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
@@ -68,7 +68,7 @@ if (WITH_TESTING)
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
HW_ASCENND_NPU_DEPS ${hw_ascend_npu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
@@ -110,7 +110,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels})
HWAscendNPU_DEPS ${hw_ascend_npu_kernels})
endif()
# for light api
@@ -132,8 +132,7 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels})
HWAscendNPU_DEPS ${hw_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -152,8 +151,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -301,6 +299,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -338,8 +337,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......
@@ -8,3 +8,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(hw_ascend_npu)
if(NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs})
lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs})
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs}
target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/device.h"
#include <map>
#include <string>
#include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
std::shared_ptr<HWAscendNPURuntime> Device::Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
VLOG(3) << "[HWAscendNPU] Build model";
// Build the IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::ModelBufferData model;
std::map<std::string, std::string> build_options;
build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
if (ret != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
return nullptr;
}
std::shared_ptr<HWAscendNPURuntime> model_runtime(
new HWAscendNPURuntime(model.data, model.length));
CHECK(model_runtime != nullptr);
if (!model_runtime->model_loaded()) {
LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
return nullptr;
}
VLOG(3) << "[HWAscendNPU]: Build done";
return model_runtime;
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
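An illustrative usage sketch of Device::Build (not part of this commit; node construction is elided), mirroring how the subgraph engine further below drives it:
// Illustrative only: the input/output nodes would be ge::op::Data and
// operator nodes collected from the converted IR graph.
std::vector<ge::Operator> input_nodes;
std::vector<ge::Operator> output_nodes;
auto runtime = paddle::lite::hw_ascend_npu::Device::Global().Build(input_nodes,
                                                                   output_nodes);
if (runtime == nullptr || !runtime->model_loaded()) {
  // Building or loading the om model failed.
}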
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ge/ge_ir_build.h" // NOLINT
#include "lite/backends/hw_ascend_npu/runtime.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
int freq_level() { return freq_level_; }
int framework_type() { return framework_type_; }
int model_type() { return model_type_; }
int device_type() { return device_type_; }
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference.
std::shared_ptr<HWAscendNPURuntime> Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT
private:
int freq_level_{3};
int framework_type_{0};
int model_type_{0};
int device_type_{0};
};
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/backends/hw_ascend_npu/target_wrapper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
HWAscendNPURuntime::HWAscendNPURuntime(
std::shared_ptr<uint8_t> model_buff_built, size_t model_buff_size) {
model_loaded_ = (0 == LoadModelFromMem(model_buff_built, model_buff_size));
}
HWAscendNPURuntime::~HWAscendNPURuntime() {
UnloadModel();
DestroyDesc();
DestroyInput();
DestroyOutput();
}
int HWAscendNPURuntime::LoadModelFromMem(
std::shared_ptr<uint8_t> model_buff_built, size_t model_buff_size) {
if (model_loaded_) {
LOG(ERROR) << "[HWAscendNPU]: Has already loaded a model";
return 0;
}
aclError ret = aclmdlQuerySizeFromMem(model_buff_built.get(),
model_buff_size,
&model_size_,
&model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, "
"error code: "
<< ret;
return ret;
}
ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, "
"error code: "
<< ret;
return ret;
}
ret = aclrtMalloc(
&model_weights_ptr_, model_weights_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model "
"weights, error code: "
<< ret;
return ret;
}
ret = aclmdlLoadFromMemWithMem(model_buff_built.get(),
model_buff_size,
&model_id_,
model_ptr_,
model_size_,
model_weights_ptr_,
model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not Load model from memory, error code: "
<< ret;
return ret;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(ERROR) << "HWAscendNPU]: Can not create model descriptor.";
return ACL_ERROR_FAILURE;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not get model descriptor from model, "
"error code: "
<< ret;
return ret;
}
return ret;
}
int HWAscendNPURuntime::CreateInput(const std::vector<DDim>& idims) {
if (itensors_ != nullptr) {
DestroyInput();
}
itensors_ = aclmdlCreateDataset();
if (itensors_ == nullptr) {
LOG(ERROR) << "[HWAscendNPU]: Can not create input dataset";
return ACL_ERROR_FAILURE;
}
for (auto& dim : idims) {
void* buff_dev_ptr = nullptr;
CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr,
dim.production(),
ACL_MEM_MALLOC_NORMAL_ONLY));
aclDataBuffer* input_data_buffer =
aclCreateDataBuffer(buff_dev_ptr, dim.production());
CHECK(input_data_buffer != nullptr);
CHECK(ACL_ERROR_NONE ==
aclmdlAddDatasetBuffer(itensors_, input_data_buffer));
}
return 0;
}
int HWAscendNPURuntime::CreateOutput(const std::vector<DDim>& odims) {
if (otensors_ != nullptr) {
DestroyOutput();
}
otensors_ = aclmdlCreateDataset();
if (otensors_ == nullptr) {
LOG(ERROR) << "[HWAscendNPU]: Can not create output dataset";
return ACL_ERROR_FAILURE;
}
for (auto& dim : odims) {
void* buff_dev_ptr = nullptr;
CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr,
dim.production(),
ACL_MEM_MALLOC_NORMAL_ONLY));
aclDataBuffer* output_data_buffer =
aclCreateDataBuffer(buff_dev_ptr, dim.production());
CHECK(output_data_buffer != nullptr);
CHECK(ACL_ERROR_NONE ==
aclmdlAddDatasetBuffer(otensors_, output_data_buffer));
}
return 0;
}
void HWAscendNPURuntime::UnloadModel() {
if (!model_loaded_) {
LOG(ERROR) << "[HWAscendNPU]: No model has been loaded";
return;
}
aclError ret = ACL_ERROR_NONE;
ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Unload a model, error code: " << ret;
return;
}
if (model_ptr_) {
aclrtFree(model_ptr_);
model_ptr_ = nullptr;
}
if (model_weights_ptr_) {
aclrtFree(model_weights_ptr_);
model_weights_ptr_ = nullptr;
}
model_loaded_ = false;
}
void HWAscendNPURuntime::DestroyDesc() {
if (model_desc_) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
}
void HWAscendNPURuntime::DestroyInput() {
if (itensors_ == nullptr) {
return;
}
size_t buf_num = aclmdlGetDatasetNumBuffers(itensors_);
for (size_t i = 0; i < buf_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
aclDestroyDataBuffer(data_buffer);
}
aclmdlDestroyDataset(itensors_);
itensors_ = nullptr;
}
void HWAscendNPURuntime::DestroyOutput() {
if (otensors_ == nullptr) {
return;
}
size_t buf_num = aclmdlGetDatasetNumBuffers(otensors_);
for (size_t i = 0; i < buf_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i);
aclDestroyDataBuffer(data_buffer);
}
aclmdlDestroyDataset(otensors_);
otensors_ = nullptr;
}
int HWAscendNPURuntime::SetInput(const std::vector<Tensor*>& itensors,
const std::vector<DDim>& idims) {
CHECK(itensors.size() == idims.size());
size_t input_tensor_num = itensors.size();
for (size_t i = 0; i < input_tensor_num; ++i) {
CHECK(itensors[i]->memory_size() == idims[i].production());
}
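// Rebuild the input dataset only when the buffer count or any buffer size no
// longer matches the requested dims; otherwise the existing device buffers are
// reused.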
size_t num_buffers_in_dataset = aclmdlGetDatasetNumBuffers(itensors_);
if (num_buffers_in_dataset != input_tensor_num) {
if (0 != CreateInput(idims)) {
return -1;
}
} else {
bool need_to_create_input = false;
for (size_t i = 0; i < num_buffers_in_dataset; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
int64_t buf_size = aclGetDataBufferSize(data_buffer);
if (buf_size != idims[i].production()) {
need_to_create_input = true;
}
}
if (need_to_create_input && 0 != CreateInput(idims)) {
return -1;
}
}
// copy input data from host to device
for (size_t i = 0; i < input_tensor_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
void* buf_dev_ptr = aclGetDataBufferAddr(data_buffer);
TargetWrapperHWAscendNPU::MemcpySync(buf_dev_ptr,
itensors[i]->raw_data(),
itensors[i]->memory_size(),
IoDirection::HtoD);
}
return 0;
}
void HWAscendNPURuntime::GetOutput(const std::vector<Tensor*>* otensors_ptr) {
CHECK(otensors_ptr != nullptr);
size_t num_output = aclmdlGetDatasetNumBuffers(otensors_);
const std::vector<Tensor*> otensors = *otensors_ptr;
CHECK(num_output == otensors.size());
for (size_t i = 0; i < num_output; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i);
TargetWrapperHWAscendNPU::MemcpySync(otensors[i]->raw_data(),
aclGetDataBufferAddr(data_buffer),
aclGetDataBufferSize(data_buffer),
IoDirection::DtoH);
}
}
int HWAscendNPURuntime::Process() {
aclError ret = aclmdlExecute(model_id_, itensors_, otensors_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Execute model failed, model_id: " << model_id_
<< ", error code: " << ret;
}
return ret;
}
int HWAscendNPURuntime::GetModelIOTensorDim(std::vector<TensorDesc>* idims,
std::vector<TensorDesc>* odims) {
aclError ret = ACL_ERROR_NONE;
size_t num_inputs = aclmdlGetNumInputs(model_desc_);
size_t num_outputs = aclmdlGetNumOutputs(model_desc_);
for (size_t i = 0; i < num_inputs; ++i) {
aclmdlIODims dims;
ret = aclmdlGetInputDims(model_desc_, i, &dims);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Get input dims failed, index: " << i;
return ret;
}
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
aclFormat format = aclmdlGetInputFormat(model_desc_, i);
idims->push_back(TensorDesc(data_type, dims, format));
}
for (size_t i = 0; i < num_outputs; ++i) {
aclmdlIODims dims;
ret = aclmdlGetOutputDims(model_desc_, i, &dims);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Get output dims failed, index: " << i;
return ret;
}
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
aclFormat format = aclmdlGetOutputFormat(model_desc_, i);
odims->push_back(TensorDesc(data_type, dims, format));
}
return 0;
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
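A minimal lifecycle sketch for HWAscendNPURuntime (illustrative only, not part of the commit); SubgraphEngine::LaunchDeviceProgram further below follows the same sequence:
// Illustrative only: `model_buf`/`model_len` come from ge::aclgrphBuildModel;
// `inputs`/`outputs` are lite::Tensor* vectors, `idims`/`odims` their DDims.
paddle::lite::hw_ascend_npu::HWAscendNPURuntime runtime(model_buf, model_len);
CHECK(runtime.model_loaded());
CHECK_EQ(runtime.SetInput(inputs, idims), 0);  // host-to-device copy
CHECK_EQ(runtime.CreateOutput(odims), 0);      // allocate device output buffers
CHECK_EQ(runtime.Process(), 0);                // runs aclmdlExecute
runtime.GetOutput(&outputs);                   // device-to-host copy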
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// hw_ascend_npu runtime library
#include <acl/acl.h>
#include <acl/tensor.h>
#include <memory>
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
tensor_desc_ =
aclCreateTensorDesc(data_type, dims.dimCount, dims.dims, format);
CHECK(tensor_desc_ != nullptr);
aclSetTensorDescName(tensor_desc_, dims.name);
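// For NHWC tensors, remap the logical N/C/H/W accessors below to the physical
// dimension order reported by ACL; NCHW keeps the identity order.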
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
}
~TensorDesc() {
if (tensor_desc_ != nullptr) {
aclDestroyTensorDesc(tensor_desc_);
tensor_desc_ = nullptr;
}
}
uint32_t GetNumber() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[0]));
}
uint32_t GetChannel() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[1]));
}
uint32_t GetHeight() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[2]));
}
uint32_t GetWidth() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[3]));
}
const aclTensorDesc& GetTensorDesc() const { return *tensor_desc_; }
private:
aclTensorDesc* tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<uint32_t> dim_order{0, 1, 2, 3};
};
class HWAscendNPURuntime {
public:
HWAscendNPURuntime(std::shared_ptr<uint8_t> model_buff_built,
size_t model_buff_size);
~HWAscendNPURuntime();
int SetInput(const std::vector<Tensor*>& itensors,
const std::vector<DDim>& idims);
void GetOutput(const std::vector<Tensor*>* otensors_ptr);
int Process();
bool model_loaded() const { return model_loaded_; }
int CreateInput(const std::vector<DDim>& idims);
int CreateOutput(const std::vector<DDim>& odims);
int GetModelIOTensorDim(std::vector<TensorDesc>* idims,
std::vector<TensorDesc>* odims);
private:
int LoadModelFromMem(std::shared_ptr<uint8_t> model_buff_built,
size_t model_buff_size);
void UnloadModel();
void DestroyDesc();
void DestroyInput();
void DestroyOutput();
private:
aclmdlDataset* itensors_{nullptr};
aclmdlDataset* otensors_{nullptr};
uint32_t model_id_{0};
void* model_ptr_{nullptr};
void* model_weights_ptr_{nullptr};
size_t model_size_{0};
size_t model_weights_size_{0};
bool model_loaded_{false};
aclmdlDesc* model_desc_{nullptr};
};
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/target_wrapper.h"
#include <acl/acl.h>
#include <glog/logging.h>
namespace paddle {
namespace lite {
void* TargetWrapperHWAscendNPU::Malloc(size_t size) {
void* ptr{nullptr};
if (ACL_ERROR_NONE != aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_NORMAL_ONLY)) {
LOG(ERROR) << "[HWAscendNPU]: Allocate memory from device failed";
ptr = nullptr;
}
return ptr;
}
void TargetWrapperHWAscendNPU::Free(void* ptr) { aclrtFree(ptr); }
void TargetWrapperHWAscendNPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
template <>
class TargetWrapper<TARGET(kHWAscendNPU)> {
public:
static size_t num_devices() { return 4; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
using TargetWrapperHWAscendNPU = TargetWrapper<TARGET(kHWAscendNPU)>;
} // namespace lite
} // namespace paddle
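A small host/device round trip through the wrapper (illustrative sketch, not part of this commit):
// Sketch only: copy a host buffer to the device and back via the wrapper.
std::vector<uint8_t> host_src(1024, 0), host_dst(1024);
void* dev = paddle::lite::TargetWrapperHWAscendNPU::Malloc(host_src.size());
paddle::lite::TargetWrapperHWAscendNPU::MemcpySync(
    dev, host_src.data(), host_src.size(), paddle::lite::IoDirection::HtoD);
paddle::lite::TargetWrapperHWAscendNPU::MemcpySync(
    host_dst.data(), dev, host_dst.size(), paddle::lite::IoDirection::DtoH);
paddle::lite::TargetWrapperHWAscendNPU::Free(dev);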
@@ -535,6 +535,7 @@ class ContextScheduler {
.As<HWAscendNPUContext>()
.CopySharedTo(&ctx->As<HWAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
......
@@ -17,7 +17,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
FPGA_DEPS ${fpga_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_hw_ascend_npu HWAscendNPU basic SRCS subgraph_compute.cc DEPS
${lite_kernel_deps} device_hw_ascend_npu subgraph_bridge_engine ${hw_ascend_npu_subgraph_bridges})
@@ -2,11 +2,15 @@ if (NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
message(STATUS "======compile hw_ascend_npu bridges, ${ascend_builder_libs}")
message(STATUS "======compile hw_ascend_npu bridges, ${hw_ascend_npu_builder_libs}")
lite_cc_library(subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc
DEPS ${hw_ascend_npu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc
DEPS ${ascend_builder_libs})
DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
template <typename ActType>
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ActType>(out_name);
auto act_op = act_node->template data<ActType>();
act_op->set_input_x(*x_node->data());
return SUCCESS;
}
template <>
int ActConverter<ge::op::Activation>(void* ctx,
OpLite* op,
KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc.
act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
act_op->set_attr_coef(Relu_clipped_coef);
} else if (op_type == "relu6") {
float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef);
} else if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
act_op->set_attr_negative_slope(alpha);
} else if (op_type == "hard_sigmoid") {
auto slope = op_info->GetAttr<float>("slope");
auto offset = op_info->GetAttr<float>("offset");
act_op->set_attr_negative_slope(slope);
act_op->set_attr_coef(offset);
}
return SUCCESS;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
sigmoid,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
tanh,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu_clipped,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu6,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
leaky_relu,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
abs,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softsign,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softplus,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
hard_sigmoid,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
log,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Log>);
REGISTER_SUBGRAPH_BRIDGE(
square,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Square>);
REGISTER_SUBGRAPH_BRIDGE(
sqrt,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Sqrt>);
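A bridge for another unary op would follow the same registration pattern; a hypothetical example (not part of this commit, and it assumes a ge::op::Exp operator with set_input_x exists in all_ops.h):
// Hypothetical registration, shown only to illustrate the pattern.
REGISTER_SUBGRAPH_BRIDGE(
exp,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Exp>);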
@@ -14,8 +14,8 @@
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
/// reference from opp package
#include <all_ops.h>
#include <utility>
#include "lite/kernels/hw_ascend_npu/utility.h"
namespace paddle {
namespace lite {
@@ -49,8 +49,9 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType precision = tensor.precision();
if (tensor.persistable()) {
// Const node
node = Add<ge::Const>(name, precision, layout);
node->data<ge::Const>()->set_attr_value(CvtTensor(tensor, shape, layout));
node = Add<ge::op::Const>(name, precision, layout);
node->data<ge::op::Const>()->set_attr_value(
CvtTensor(tensor, shape, layout));
} else {
// Data node
node = Add(name, shape, precision, layout);
@@ -63,10 +64,10 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout) {
auto node = Add<ge::Data>(name, precision, layout);
auto node = Add<ge::op::Data>(name, precision, layout);
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
node->data<ge::Data>()->update_input_desc_x(desc);
node->data<ge::op::Data>()->update_input_desc_data(desc);
return node;
}
......
@@ -14,13 +14,15 @@
#pragma once
// reference from atc package
#include <all_ops.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// reference from atc package
#include "graph/operators.h"
#include "graph/operator.h"
#include "graph/operator_reg.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/utility.h"
#include <algorithm>
#include <utility>
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
ge::DataType CvtPrecisionType(PrecisionType itype) {
ge::DataType otype = ge::DT_FLOAT;
switch (itype) {
case PRECISION(kFloat):
otype = ge::DT_FLOAT;
break;
case PRECISION(kInt8):
otype = ge::DT_INT8;
break;
case PRECISION(kInt32):
otype = ge::DT_INT32;
break;
case PRECISION(kFP16):
otype = ge::DT_FLOAT16;
break;
case PRECISION(kBool):
otype = ge::DT_BOOL;
break;
case PRECISION(kInt64):
otype = ge::DT_INT64;
break;
case PRECISION(kInt16):
otype = ge::DT_INT16;
break;
default:
LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert precision type("
<< PrecisionToStr(itype) << ") from Lite to HW_ASCEND_NPU";
break;
}
return otype;
}
ge::Format CvtDataLayoutType(DataLayoutType itype) {
ge::Format otype = ge::FORMAT_NCHW;
switch (itype) {
case DATALAYOUT(kNCHW):
otype = ge::FORMAT_NCHW;
break;
case DATALAYOUT(kNHWC):
otype = ge::FORMAT_NHWC;
break;
// TODO(yanghongtian): support more data layout type
default:
LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert data layout type("
<< DataLayoutToStr(itype) << ") from Lite to HW_ASCEND_NPU";
break;
}
return otype;
}
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape) {
CHECK(in_shape.size() <= 4 && in_shape.size() > 0)
<< "[HW_ASCEND_NPU] The size of in_shape is invalid: " << in_shape.size();
// Padding the shape to 4-dimensions(NCHW)
std::vector<int64_t> out_shape(4, 1);
std::copy(in_shape.begin(),
in_shape.end(),
out_shape.begin() + 4 - in_shape.size());
return out_shape;
}
std::vector<int64_t> CvtShape(const DDim& in_dims) {
return CvtShape(in_dims.Vectorize());
}
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape,
DataLayoutType in_layout) {
PrecisionType in_precision = in_tensor.precision();
auto in_size = in_tensor.dims().production();
auto in_shape = in_tensor.dims().Vectorize();
if (out_shape.empty()) {
out_shape = in_shape;
}
ge::TensorDesc out_desc(ge::Shape(out_shape),
CvtDataLayoutType(in_layout),
CvtPrecisionType(in_precision));
auto out_size = out_desc.GetShape().GetShapeSize();
CHECK_EQ(out_size, in_size);
ge::Tensor out_tensor;
out_tensor.SetTensorDesc(out_desc);
out_tensor.SetData(reinterpret_cast<const uint8_t*>(in_tensor.raw_data()),
in_tensor.memory_size());
return std::move(out_tensor);
}
int CvtActMode(const std::string& act_type) {
// based on the nonlinear_fuc_ops.h in OPP (line 210)
// default to Relu
int act_mode = 1;
if (act_type == "sigmoid") {
act_mode = 0;
} else if (act_type == "relu") {
act_mode = 1;
} else if (act_type == "tanh") {
act_mode = 2;
} else if (act_type == "relu_clipped" || act_type == "relu6") {
act_mode = 3;
} else if (act_type == "elu") {
act_mode = 4;
} else if (act_type == "leaky_relu") {
act_mode = 5;
} else if (act_type == "abs") {
act_mode = 6;
} else if (act_type == "relu1") {
// TODO(yanghongtian): check hw_ascend_npu supports relu1 or not.
act_mode = 7;
} else if (act_type == "softsign") {
act_mode = 8;
} else if (act_type == "softplus") {
act_mode = 9;
} else {
// TODO(yanghongtian): support more activation mode
LOG(FATAL) << "[NPU] Unsupported activation type " << act_type;
}
return act_mode;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
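A small worked example of the converters above (illustrative only):
// Illustrative only: CvtShape left-pads shapes with 1s up to 4-D NCHW.
auto s = paddle::lite::subgraph::hw_ascend_npu::CvtShape(
    std::vector<int64_t>({3, 224, 224}));
// s == {1, 3, 224, 224}
// CvtPrecisionType and CvtActMode follow the mappings above, e.g.
//   CvtPrecisionType(PRECISION(kFloat)) -> ge::DT_FLOAT
//   CvtActMode("relu6")                 -> 3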
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <graph/tensor.h>
#include <graph/types.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Type/tensor converters for converting Paddle type/tensor to hw ascend npu IR
// type
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
// Padding the shape to 4-dimensions(NCHW) for HW_ASCEND_NPU
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape);
std::vector<int64_t> CvtShape(const DDim& in_dims);
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(const std::string& act_type);
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/hw_ascend_npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace hw_ascend_npu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all of the ops, along with their input vars and weights, and add
// them to the HWAscendNPU IR graph
subgraph::hw_ascend_npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kHWAscendNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Collect the valid input and output nodes in the HWAscendNPU IR graph and update
// the input and output names
device_inames_.clear();
device_onames_.clear();
std::vector<ge::Operator> device_inodes;
std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) {
if (graph.Has(input_name)) {
if (graph.Get(input_name)->is_data()) {
device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[HWAscendNPU] Input node " << input_name
<< " is ignored because it is not a data node.";
}
} else {
LOG(WARNING) << "[HWAscendNPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
for (auto& output_name : output_names_) {
if (graph.Has(output_name)) {
device_onodes.push_back(*graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[HWAscendNPU] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[HWAscendNPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[HWAscendNPU] No output nodes found for building NPU model";
// Build the IR graph to om model as the device program
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client =
lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes);
if (device_client == nullptr) {
LOG(WARNING) << "[HWAscendNPU] Build model failed!";
return subgraph::FAILED;
}
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors
std::vector<TensorDesc> device_idims, device_odims;
if (device_program->client->GetModelIOTensorDim(&device_idims,
&device_odims) != 0) {
LOG(WARNING) << "[HWAscendNPU] Get the dimensions of input and output "
"tensors failed!";
return subgraph::FAILED;
}
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
for (size_t i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[HWAscendNPU] Inputs[" << i << "] name: " << device_inames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout) << " dims: {"
<< device_idims[i].GetNumber() << ","
<< device_idims[i].GetChannel() << ","
<< device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
<< "}";
// Prepare the device input tensors
CHECK_EQ(origin_idims_[i].production(),
device_idims[i].GetNumber() * device_idims[i].GetChannel() *
device_idims[i].GetHeight() * device_idims[i].GetWidth());
}
device_program->origin_idims = origin_idims_;
for (size_t i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[HWAscendNPU] Outputs[" << i << "] name: " << device_onames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout) << " dims: {"
<< device_odims[i].GetNumber() << ","
<< device_odims[i].GetChannel() << ","
<< device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
<< "}";
// Prepare the device output tensors
switch (precision) {
case PRECISION(kFloat):
origin_otensors_[i]->mutable_data<float>();
break;
case PRECISION(kBool):
origin_otensors_[i]->mutable_data<bool>();
break;
case PRECISION(kInt8):
origin_otensors_[i]->mutable_data<int8_t>();
break;
case PRECISION(kInt16):
origin_otensors_[i]->mutable_data<int16_t>();
break;
case PRECISION(kInt32):
origin_otensors_[i]->mutable_data<int32_t>();
break;
case PRECISION(kInt64):
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[HWAscendNPU] " << device_onames_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth());
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HWAscendNPU
// tensors
auto device_program = device_program_map_[inputs_shape_];
int ret = 0;
ret = device_program->client->SetInput(origin_itensors_,
device_program->origin_idims);
if (ret != 0) {
return ret;
}
device_program->client->CreateOutput(device_program->origin_odims);
// run inference
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process(), 0);
VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
<< " us";
device_program->client->GetOutput(&origin_otensors_);
return 0;
}
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace hw_ascend_npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kHWAscendNPU,
kAny,
kNCHW,
paddle::lite::kernels::hw_ascend_npu::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.Finalize();
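To route supported ops to this kernel at runtime, the kHWAscendNPU place has to be listed among the valid places of the full API; a hedged sketch, assuming the new target enum is wired up elsewhere in this patch:
// Sketch only: follows the existing CxxConfig usage; `model_dir` is a placeholder.
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_valid_places({
    paddle::lite_api::Place{TARGET(kHWAscendNPU), PRECISION(kFloat)},
    paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
});
auto predictor =
    paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);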
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <graph/tensor.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
using HWAscendNPURuntime = paddle::lite::hw_ascend_npu::HWAscendNPURuntime;
using TensorDesc = paddle::lite::hw_ascend_npu::TensorDesc;
namespace paddle {
namespace lite {
namespace kernels {
namespace hw_ascend_npu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<HWAscendNPURuntime> _client)
: client(_client) {}
std::shared_ptr<HWAscendNPURuntime> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<TensorDesc> device_idims{};
std::vector<TensorDesc> device_odims{};
};
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::vector<std::vector<int64_t>> inputs_shape_{};
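// One compiled om model is cached per distinct combination of input shapes,
// so a shape change triggers a rebuild only the first time it is seen.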
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
device_program_map_{};
std::vector<std::string> device_inames_{};
std::vector<std::string> device_onames_{};
};
class SubgraphCompute
: public KernelLite<TARGET(kHWAscendNPU), PRECISION(kAny)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace hw_ascend_npu
} // namespace kernels
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
......