提交 5cab7cdd 编写于 作者: Y yanghongtian

collect all

上级 f6cf1f9f
......@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME)
endif()
message(STATUS "LITE_WITH_HW_ASCEND_NPU: ${LITE_WITH_HW_ASCEND_NPU}")
find_path(ACL_INC NAMES acl/acl.h
PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/include")
endif()
include_directories("${ACL_INC}")
set(ACL_LIB_FILES
acl_dvpp
ascendcl
register
runtime
)
foreach (libname ${ACL_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/acllib/lib64)
if (lib_name_path_${libname})
add_library(acl_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND acl_libs acl_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()
set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
# find atc include folder and library
find_path(ATC_INC NAMES ge/ge_ir_build.h
......@@ -61,6 +34,8 @@ endif()
include_directories("${ATC_INC}")
set(ATC_LIB_FILES
ge_compiler
graph
_caffe_parser
auto_tiling
c_sec
......@@ -76,9 +51,7 @@ set(ATC_LIB_FILES
fmk_tensorflow_parser
ge_client
ge_common
ge_compiler
ge_executor
graph
mmpa
msprof
parser_common
......@@ -92,6 +65,16 @@ set(ATC_LIB_FILES
tvm_runtime
tvm_topi
)
# ATC graph-engine plugin libraries, located under ${ASCEND_HOME}/atc/lib64/plugin/.
# NOTE(review): "NNENGIN" (sic) matches the spelling used by the foreach that
# consumes this list, so it resolves correctly despite the apparent typo.
set(ATC_PLUGIN_NNENGIN_LIB_FILES
engine
)
# Ops-kernel plugin libraries (AI CPU engine, fusion engine, GE local engine,
# runtime engine).
set(ATC_PLUGIN_OPSKERNEL_LIB_FILES
aicpu_engine
fe
ge_local_engine
rts_engine
)
foreach (libname ${ATC_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64)
......@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES})
endif()
endforeach()
# Locate each ATC nnengine plugin library and wrap it as a GLOBAL imported
# target named atc_<lib>; every found target is appended to ${atc_libs}.
# Configuration fails hard if any library is missing.
foreach (libname ${ATC_PLUGIN_NNENGIN_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/nnengine)
if (lib_name_path_${libname})
add_library(atc_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND atc_libs atc_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()

# Same treatment for the opskernel plugin libraries; results also accumulate
# into ${atc_libs}.
foreach (libname ${ATC_PLUGIN_OPSKERNEL_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/opskernel)
if (lib_name_path_${libname})
add_library(atc_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND atc_libs atc_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()
# find opp include folder and library
find_path(OPP_INC NAMES all_ops.h
PATHS ${ASCEND_HOME}/opp/op_proto/built-in/inc)
......@@ -139,10 +144,59 @@ else()
set_property(TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION ${OPP_FUSION_VECTORCORE})
endif()
set(hw_ascend_npu_builder_libs
add_library(hw_ascend_npu_builder_libs INTERFACE)
target_link_libraries(hw_ascend_npu_builder_libs INTERFACE
${atc_libs}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib)
#set(hw_ascend_npu_builder_libs
# ${atc_libs}
# opp_opsproto_lib
# opp_fusion_pass_aicore_lib
# opp_fusion_pass_vectorcore_lib
# CACHE INTERNAL "ascend builder libs")
# find ascend cl runtime library
find_path(ACL_INC NAMES acl/acl.h
  PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
  # BUGFIX: report the directory that was actually searched
  # (acllib/include), not ${ASCEND_HOME}/include.
  message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/acllib/include")
endif()
include_directories("${ACL_INC}")

# ACL runtime libraries; each is wrapped below as a GLOBAL imported target
# named acl_<lib> and collected into ${acl_libs}.
set(ACL_LIB_FILES
  acl_dvpp
  ascendcl
  register
  runtime
)
foreach(libname ${ACL_LIB_FILES})
  find_library(lib_name_path_${libname} NAMES ${libname}
    PATHS ${ASCEND_HOME}/acllib/lib64)
  if(lib_name_path_${libname})
    add_library(acl_${libname} SHARED IMPORTED GLOBAL)
    set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
    list(APPEND acl_libs acl_${libname})
  else()
    message(FATAL_ERROR "can not find library: ${libname}")
  endif()
endforeach()
add_library(hw_ascend_npu_runtime_libs INTERFACE)
target_link_libraries(hw_ascend_npu_runtime_libs INTERFACE ${acl_libs})
add_library(hw_ascend_npu_libs INTERFACE)
target_link_libraries(hw_ascend_npu_libs INTERFACE
${atc_libs}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib
CACHE INTERNAL "ascend builder libs")
${acl_libs})
# set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
......@@ -23,7 +23,7 @@ function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS)
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -138,7 +138,7 @@ function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
......@@ -156,7 +156,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
)
......@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET)
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS HW_ASCEND_NPU_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
......@@ -204,7 +204,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
......@@ -235,7 +235,7 @@ function(lite_cc_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS COMPILE_LEVEL # (basic|extra)
)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -309,8 +309,8 @@ endif()
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS HW_ASCEND_NPU_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -444,7 +444,7 @@ function(add_kernel TARGET device level)
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -463,8 +463,8 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -499,7 +499,7 @@ function(add_operator TARGET level)
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -69,7 +69,7 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
# MLU_DEPS ${mlu_kernels}
MLU_DEPS ${mlu_kernels}
)
endif()
if(LITE_WITH_FPGA)
......
......@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
lite_cc_library(build_hw_ascend_npu SRCS build.cc DEPS
hw_ascend_npu_libs)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${hw_ascend_npu_runtime_libs})
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
${hw_ascend_npu_runtime_libs}
target_wrapper_hw_ascend_npu)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
${hw_ascend_npu_runtime_libs}
hw_ascend_npu_libs
target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu)
device_hw_ascend_npu
build_hw_ascend_npu
)
add_executable(test_build test_build.cc)
target_link_libraries(test_build build_hw_ascend_npu)
......@@ -16,7 +16,7 @@
#include <map>
#include <string>
#include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/backends/hw_ascend_npu/build.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
......@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
VLOG(3) << "[HWAscendNPU] Build model";
// Build the IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::ModelBufferData model;
std::map<std::string, std::string> build_options;
build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
if (ret != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
std::shared_ptr<ge::ModelBufferData> model_data =
paddle::lite::hw_ascend_npu::Build(input_nodes, output_nodes);
if (model_data == nullptr) {
LOG(ERROR) << "[HWAscendNPU] Build model failed";
return nullptr;
}
LOG(INFO) << "[HWAscendNPU] Build model success";
if (!inited_) {
if (0 == InitDevice()) {
LOG(INFO) << "Init success.";
inited_ = true;
}
}
std::shared_ptr<HWAscendNPURuntime> model_runtime(
new HWAscendNPURuntime(model.data, model.length));
new HWAscendNPURuntime(model_data->data, model_data->length));
CHECK(model_runtime != nullptr);
if (!model_runtime->model_loaded()) {
LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
return nullptr;
}
VLOG(3) << "[HWAscendNPU]: Build done";
LOG(INFO) << "[HWAscendNPU]: Build done";
return model_runtime;
}
int Device::InitDevice() {
const char* acl_conf = "/usr/local/acl.json";
aclError ret = aclInit(acl_conf);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl init failed";
return -1;
}
// open device
ret = aclrtSetDevice(device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl open device " << device_id_ << " failed";
return -1;
}
ret = aclrtCreateContext(&context_ptr_, device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "acl create context failed";
return -1;
}
// create stream
ret = aclrtCreateStream(&stream_ptr_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl create stream failed";
return -1;
}
// get run mode
aclrtGetRunMode runMode;
ret = aclrtGetMode(&runMode);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl get run mode failed";
return -1;
}
is_devcie_ = (runMode == ACL_DEVICE);
LOG(INFO) << "[HWAscendNPU] Hardware initialization done";
return 0;
}
// Tears down the ACL resources in reverse order of creation: the stream first,
// then the context. Both handles are nulled so a second call is a no-op.
// NOTE(review): aclrtResetDevice()/aclFinalize() are never invoked here —
// confirm whether the process-level ACL state should also be released.
void Device::ReleaseDevice() {
  if (stream_ptr_ != nullptr) {
    if (aclrtDestroyStream(stream_ptr_) != ACL_ERROR_NONE) {
      LOG(ERROR) << "[HWAscendNPU] destroy stream failed";
    }
    stream_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] end to destroy stream";

  if (context_ptr_ != nullptr) {
    if (aclrtDestroyContext(context_ptr_) != ACL_ERROR_NONE) {
      LOG(ERROR) << "[HWAscendNPU] destroy context failed";
    }
    context_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] Release device successfully";
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
......@@ -18,8 +18,9 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ge/ge_ir_build.h" // NOLINT
#include "ge/ge_ir_build.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
......@@ -30,12 +31,11 @@ class Device {
static Device x;
return x;
}
Device() {}
Device() : inited_(false) {}
int freq_level() { return freq_level_; }
int framework_type() { return framework_type_; }
int model_type() { return model_type_; }
int device_type() { return device_type_; }
~Device() { ReleaseDevice(); }
bool is_device() const { return is_devcie_; }
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference.
......@@ -45,10 +45,15 @@ class Device {
); // NOLINT
private:
int freq_level_{3};
int framework_type_{0};
int model_type_{0};
int device_type_{0};
int InitDevice();
void ReleaseDevice();
private:
bool inited_{false};
int device_id_{0};
bool is_devcie_{false};
aclrtContext context_ptr_{nullptr};
aclrtStream stream_ptr_{nullptr};
};
} // namespace hw_ascend_npu
......
......@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem(
&model_size_,
&model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, "
LOG(ERROR) << "[HWAscendNPU]: Can't query size from a built model buffer, "
"error code: "
<< ret;
<< ret << ", model buffer size: " << model_buff_size;
return ret;
}
LOG(INFO) << "[HWAscendNPU]: Query model info success, model_size: "
<< model_size_ << ", model weights_size_: " << model_weights_size_;
ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, "
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS
${lite_kernel_deps}
build_hw_ascend_npu
device_hw_ascend_npu
subgraph_bridge_engine
runtime_hw_ascend_npu
${hw_ascend_npu_subgraph_bridges}
subgraph_bridge_registry
${lite_kernel_deps}
)
......@@ -4,18 +4,24 @@ endif()
lite_cc_library(subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc
DEPS ${hw_ascend_npu_builder_libs} tensor)
DEPS hw_ascend_npu_libs tensor)
lite_cc_library(subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc
DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu
)
DEPS hw_ascend_npu_libs subgraph_bridge_utility_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridge_deps
subgraph_bridge_registry
subgraph_bridge_utility_hw_ascend_npu
subgraph_bridge_graph_hw_ascend_npu)
lite_cc_library(subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS
${hw_ascend_npu_subgraph_bridge_deps}
${hw_ascend_npu_builder_libs})
hw_ascend_npu_libs
${hw_ascend_npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_concat_op_hw_ascend_npu SRCS concat_op.cc DEPS
# ${hw_ascend_npu_subgraph_bridge_deps}
# hw_ascend_npu_builder_libs)
set(hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu
......
......@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
......@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx,
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
LOG(INFO) << "[HWAscendNPU] xname: " << x_name << ", dims: " << x_dims;
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
LOG(INFO) << "[HWAscendNPU] graph has node: " << x_name;
x_node = graph->Get(x_name);
} else {
LOG(INFO) << "[HWAscendNPU] graph does no have node: " << x_name;
x_node = graph->Add(x_name, *x);
}
LOG(INFO) << "[HWAscendNPU] out name: " << out_name;
#if 0
// Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc.
LOG(INFO) << "[HWAscendNPU] activation mode: " << op_type
<< ", type: " << CvtActMode(op_type);
act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
......@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx,
float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef);
}
#else
// Act node
auto act_node = graph->template Add<ge::op::Relu>(out_name);
auto act_op = act_node->template data<ge::op::Relu>();
act_op->set_input_x(*x_node->data());
#endif
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <all_ops.h>
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Maps a Paddle `concat` op onto a ge::op::Concat node with dynamic inputs:
// every input tensor X[i] becomes (or reuses) a graph node that is wired into
// the Concat's dynamic input list.
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HWAscendNPU] Converting " << op_type << " ... ";

  // Op inputs/outputs and attributes.
  auto x_names = op_info->Input("X");
  auto out_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto input_count = x_names.size();

  // Create the concat node first, then attach each input to it.
  auto concat_node = graph->Add<ge::op::Concat>(out_name);
  auto concat_op = concat_node->data<ge::op::Concat>();
  concat_op->set_input_concat_dim(axis);
  concat_op->set_attr_N(input_count);
  concat_op->create_dynamic_input_input_values(input_count);

  // NOTE(review): dynamic input indices start at 1 here — confirm against the
  // GE dynamic-input indexing convention.
  int input_index = 1;
  for (const auto& x_name : x_names) {
    auto x = scope->FindMutableTensor(x_name);
    auto x_dims = x->dims();
    // Reuse an existing node for this tensor if one was already added.
    std::shared_ptr<Node> x_node =
        graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);
    concat_op->set_dynamic_input_input_values(input_index, *x_node->data());
    ++input_index;
  }
  return SUCCESS;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
concat,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Converts a Paddle conv2d / depthwise_conv2d op into a GE Convolution or
// ConvolutionDepthwise node, appending Add nodes for unsupported bias shapes
// and an Activation node when an activation is fused into the conv.
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HwAscendNPU] Converting " << op_type << "... ";

  // Get input and output vars and op attributes
  auto input_name = op_info->Input("Input").front();
  auto input = scope->FindMutableTensor(input_name);
  auto input_dims = input->dims();
  auto filter_name = op_info->Input("Filter").front();
  auto filter = scope->FindMutableTensor(filter_name);
  auto filter_dims = filter->dims();
  auto output_name = op_info->Output("Output").front();
  auto output = scope->FindMutableTensor(output_name);
  auto output_dims = output->dims();
  auto bs = input_dims[0];
  auto ic = input_dims[1];
  auto oc = filter_dims[0];
  // All tensors must be 4-D NCHW and agree on batch / output channels.
  CHECK_EQ(input_dims.size(), 4L);
  CHECK_EQ(output_dims.size(), 4L);
  CHECK_EQ(filter_dims.size(), 4L);
  CHECK_EQ(output_dims[0], bs);
  CHECK_EQ(output_dims[1], oc);
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  auto groups = op_info->GetAttr<int>("groups");
  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
  bool with_act =
      op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
  std::string act_type =
      with_act ? op_info->GetAttr<std::string>("act_type") : "";
  float leaky_relu_alpha = act_type == "leaky_relu"
                               ? op_info->GetAttr<float>("leaky_relu_alpha")
                               : 0.f;
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);

  // Input node: reuse if this tensor was already added to the graph.
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    input_node = graph->Get(input_name);
  } else {
    input_node = graph->Add(input_name, *input);
  }

  // Expand 2-element paddings {ph, pw} to 4-element {ph, ph, pw, pw}.
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
      int copy_pad = *(paddings.begin() + 2 * i);
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  }
  CHECK_EQ(paddings.size(), 4L) << "[HwAscendNPU] Paddings size should be the "
                                   "same or twice as the input size.";
  std::string padding_algorithm("");
  if (op_info->HasAttr("padding_algorithm")) {
    padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
  }
  operators::UpdatePaddingAndDilation(&paddings,
                                      &dilations,
                                      strides,
                                      padding_algorithm,
                                      input_dims,
                                      filter_dims);

  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
  bool use_depthwise_conv =
      false;  // Whether use ge::op::ConvolutionDepthwise ?
  bool is_depthwise_mode = ic == groups && oc == groups;
  if (is_depthwise_mode &&
      !((groups == 1 || groups >= 5) && dilations[0] == 1 &&
        dilations[1] == 1)) {
    use_depthwise_conv = true;
    LOG(WARNING)
        << "[HwAscendNPU] For depthwise mode, dilation = 1 and groups >= 5 "
           "(or groups = 1) is only supported in Convolution Op, so "
           "force to use ConvolutionDepthwise Op, but may lead poor "
           "performance.";
  }

  // Filter node
  auto filter_node = graph->Add(filter_name, *filter);

  // Add bias node if exists bias
  // Supports the bias nodes with the following dimensions
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  std::shared_ptr<Node> bias_node = nullptr;
  bool is_channel_bias = false;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_name = op_info->Input("Bias").front();
    if (graph->Has(bias_name)) {
      bias_node = graph->Get(bias_name);
    } else {
      auto bias = scope->FindMutableTensor(bias_name);
      auto bias_dims = bias->dims();
      auto bias_data_size = bias_dims.production();
      auto output_data_size = output_dims.production();
      std::vector<int64_t> bias_shape;
      if (bias_data_size == oc) {
        // 0: {oc}
        bias_shape = {1, oc, 1, 1};
        is_channel_bias = true;
      } else if (bias_data_size == output_data_size / bs) {
        // 1: {1, oc, oh, ow}
        bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
      } else if (bias_data_size == output_data_size) {
        // 2: {n, oc, oh, ow}
        bias_shape = output_dims.Vectorize();
      } else {
        LOG(WARNING)
            << "[HwAscendNPU] Bias dimension " << bias_dims
            << " isn't supported in conv2d Op when output dimension is "
            << output_dims;
        return FAILED;
      }
      bias_node = graph->Add(bias_name, *bias, bias_shape);
    }
  }

  // Conv node
  std::shared_ptr<Node> conv_node = nullptr;
  if (use_depthwise_conv && is_depthwise_mode) {
    // BUGFIX: the node must be created with the same op type it is accessed
    // with; the original created ge::op::DepthwiseConv2D but then cast the
    // node data to ge::op::ConvolutionDepthwise, whose attribute setters are
    // used below.
    conv_node = graph->Add<ge::op::ConvolutionDepthwise>(output_name);
    auto conv_op = conv_node->data<ge::op::ConvolutionDepthwise>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_filter(*filter_node->data());
    conv_op->set_attr_mode(1);
    conv_op->set_attr_algo(0);
    conv_op->set_attr_format(0);   // NCHW
    conv_op->set_attr_pad_mode(5);  // VALID
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // ConvolutionDepthwise Op doesn't support bias, so append Add node to
    // support bias
    if (bias_node != nullptr) {
      auto add_node = graph->Add<ge::op::Add>(output_name);
      auto add_op = add_node->data<ge::op::Add>();
      add_op->set_input_x1(*conv_node->data());
      add_op->set_input_x2(*bias_node->data());
      conv_node = add_node;
    }
  } else {
    conv_node = graph->Add<ge::op::Convolution>(output_name);
    auto conv_op = conv_node->data<ge::op::Convolution>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_w(*filter_node->data());
    conv_op->set_attr_mode(1);
    // when padding_algorithm=="SAME", NPU is different from lite
    if (padding_algorithm == "VALID") {
      conv_op->set_attr_pad_mode(5);
    } else {
      conv_op->set_attr_pad_mode(0);
    }
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // Convolution Op only support bias with dimension {1, oc, 1, 1},
    // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
    if (bias_node != nullptr) {
      if (is_channel_bias) {
        conv_op->set_input_b(*bias_node->data());
      } else {
        auto add_node = graph->Add<ge::op::Add>(output_name);
        auto add_op = add_node->data<ge::op::Add>();
        add_op->set_input_x1(*conv_node->data());
        add_op->set_input_x2(*bias_node->data());
        conv_node = add_node;
      }
    }
  }
  CHECK(conv_node);

  // Fused activation, if any, consumes the conv (or bias-add) output.
  if (!act_type.empty()) {
    auto act_node = graph->Add<ge::op::Activation>(output_name);
    auto act_op = act_node->data<ge::op::Activation>();
    act_op->set_input_x(*conv_node->data());
    act_op->set_attr_mode(CvtActMode(act_type));
    if (act_type == "leaky_relu") {
      act_op->set_attr_negative_slope(leaky_relu_alpha);
    } else if (act_type == "relu6") {
      act_op->set_attr_coef(6.f);
    }
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// BUGFIX: ConvConverter is defined in namespace
// paddle::lite::subgraph::hw_ascend_npu in this file; the original
// registration referenced the unrelated npu (HiAI) namespace.
REGISTER_SUBGRAPH_BRIDGE(conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
......@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
if (it != nodes_.end()) {
// Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined.";
LOG(FATAL) << "[HWAscendNPU] Const or data node " << name
<< " is redefined.";
return -1;
}
} else {
......@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType precision,
DataLayoutType layout) {
auto node = Add<ge::op::Data>(name, precision, layout);
std::stringstream iss;
iss << "[HWAscendNPU] Add data node, shape: ";
for (auto& s : shape) {
iss << s << ",";
}
iss << " name: " << name;
LOG(INFO) << iss.str();
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
node->data<ge::op::Data>()->update_input_desc_data(desc);
......
......@@ -181,13 +181,14 @@ class Graph {
}
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[NPU] Node " << name << " not found.";
CHECK(Has(name)) << "[HWAscendNPU] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
size_t size() const { return nodes_.size(); }
private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
......
......@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(std::string act_type);
int CvtActMode(const std::string& act_type);
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
......
......@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() {
// the HWAscendNPU IR graph
subgraph::hw_ascend_npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
LOG(INFO) << "[HWAscendNPU] Build device program";
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
LOG(INFO) << "[HWAscendNPU] trying to convert OP: " << op_type;
if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type
<< " does not exist for target HWAscendNPU";
return subgraph::FAILED;
}
LOG(INFO) << "[HWAscendNPU] OP: " << op_type << " exists for HWAscendNPU";
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kHWAscendNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type << " select kernel failed";
return subgraph::FAILED;
}
LOG(INFO) << "[HWAscendNPU] OP: " << op_type
<< " select kernel for HWAscendNPU";
}
LOG(INFO) << "[HWAscendNPU] Graph size: " << graph.size();
// Collect the valid input and output nodes in the HiAI IR graph and update
// the input and output names
device_inames_.clear();
......@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() {
std::vector<ge::Operator> device_inodes;
std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) {
LOG(INFO) << "[HWAscendNPU] input name: " << input_name;
if (graph.Has(input_name)) {
LOG(INFO) << "[HWAscendNPU] Graph has input name: " << input_name;
if (graph.Get(input_name)->is_data()) {
LOG(INFO) << "[HWAscendNPU] the current input name: " << input_name
<< " is data";
device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
......@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() {
CHECK(!device_onames_.empty())
<< "[HWAscendNPU] No output nodes found for building NPU model";
LOG(INFO) << "[HWAscendNPU] Graph size to build: " << graph.size();
// Build the IR graph to om model as the device program
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
LOG(INFO) << "[HWAscendNPU] Start to build, device_inodes = "
<< device_inodes.size()
<< ", device_onodes = " << device_onodes.size();
auto device_client =
lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes);
if (device_client == nullptr) {
......@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
// tensors
auto device_program = device_program_map_[inputs_shape_];
int ret = 0;
LOG(INFO) << "[HWAscendNPU] start to set input...";
ret = device_program->client->SetInput(origin_itensors_,
device_program->origin_idims);
if (ret != 0) {
return ret;
}
LOG(INFO) << "[HWAscendNPU] start to create output...";
device_program->client->CreateOutput(device_program->origin_odims);
......@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process(), 0);
VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
LOG(INFO) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
<< " us";
device_program->client->GetOutput(&origin_otensors_);
LOG(INFO) << "[HWAscendNPU] Get ouput done";
return 0;
}
......@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() {
}
// Kernel entry point: validates the subgraph engine and delegates to
// Engine::Launch(), which picks the device or the original program.
void SubgraphCompute::Run() {
  LOG(INFO) << "[HWAscendNPU] Start to run";
  // engine_ is presumably created in PrepareForRun() — verify against that
  // method; a null engine here aborts the process.
  CHECK(engine_);
  LOG(INFO) << "[HWAscendNPU] Start to call Launch";
  engine_->Launch();
}
......
......@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() {
}
int Engine::Launch() {
LOG(INFO) << "[HWAscendNPU] in Launch, start to build if needed";
// Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
LOG(INFO) << "[HWAscendNPU] launch program";
if (CHECK_FAILED(build_device_program_status_)) {
LOG(INFO) << "[HWAscendNPU] launch original program";
LaunchOriginProgram();
} else {
LOG(INFO) << "[HWAscendNPU] launch device program";
LaunchDeviceProgram();
}
return 0;
......
......@@ -9,7 +9,8 @@ set (kernels
${host_kernels}
${hw_ascend_npu_kernels})
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
message(STATUS "======---------------------------------=================${hw_ascend_npu_kernels}")
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
......
......@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) {
place = TARGET(kXPU);
#elif defined(LITE_WITH_HW_ASCEND_NPU)
place = TARGET(kHWAscendNPU);
std::cout << "-----------test relu with hw_ascend_npu" << std::endl;
#else
return;
#endif
......
......@@ -3,7 +3,7 @@ set -ex
# Global configuration variables with default values.
# NOTE: the original assigned TARGET_NAME twice (a stale diff line was
# retained); the first assignment was a dead store — only the effective
# default is kept here.
ASCEND_HOME="/usr/local/Ascend"               # Ascend SDK root directory
TARGET_NAME="test_kernel_activation_compute"  # default build target
BUILD_EXTRA=ON                                # ON (with sequence ops) / OFF
WITH_TESTING=ON                               # ON / OFF
......@@ -80,7 +80,7 @@ function build_hw_ascend_npu {
-DWITH_TESTING=${WITH_TESTING} \
-DASCEND_HOME=${HW_ASCEND_NPU_SDK_ROOT}
make -j$NUM_CORES_FOR_COMPILE
make $TARGET_NAME -j2
cd -
echo "Done"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册