Unverified commit d5434aa2, authored by hong19860320, committed by GitHub


[LITE][NPU][XPU] Refine subgraph pass, and support NPU/XPU model generation at execution time (#2576)
Parent d8750966
@@ -118,7 +118,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -128,10 +128,10 @@ function(lite_cc_library TARGET)
       X86_DEPS ${args_X86_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
-      NPU_DEPS ${args_NPU_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -161,7 +161,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -173,6 +173,8 @@
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -205,7 +207,7 @@ function(lite_cc_test TARGET)
   endif()
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS
     COMPILE_LEVEL # (basic|extra)
@@ -225,6 +227,8 @@
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -267,7 +271,7 @@ endif()
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -360,11 +364,12 @@ function(add_kernel TARGET device level)
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
       DEPS ${args_DEPS}
       X86_DEPS ${args_X86_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -383,7 +388,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -409,11 +414,12 @@ function(add_operator TARGET level)
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
       DEPS ${args_DEPS}
       X86_DEPS ${args_X86_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
...
@@ -89,7 +89,7 @@ else()
 endif()
 find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+  PATHS ${XPU_SDK_ROOT}/XTDK/shlib/gcc482)
 if(NOT XPU_SDK_LLVM_FILE)
   message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
...
@@ -42,7 +42,7 @@ else()
     add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
     if (LITE_WITH_NPU)
       # Need to add HIAI runtime libs (libhiai.so) dependency
-      target_link_libraries(paddle_light_api_shared ${npu_runtime_libs})
+      target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
     endif()
   endif()
 endif()
@@ -78,8 +78,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
       DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
       X86_DEPS ${x86_kernels}
       ARM_DEPS ${arm_kernels}
-      NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
-      XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+      NPU_DEPS ${npu_kernels}
+      XPU_DEPS ${xpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels})
 endif()
...
@@ -108,7 +108,7 @@ USE_LITE_OP(while)
 USE_LITE_OP(lod_reset)
 USE_LITE_OP(lookup_table)
 USE_LITE_OP(multiclass_nms)
-USE_LITE_OP(graph_op)
+USE_LITE_OP(subgraph)
 USE_LITE_OP(sequence_expand)
 USE_LITE_OP(sequence_pool)
 USE_LITE_OP(reduce_max)
...
@@ -30,7 +30,7 @@ else()
     add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
     if (LITE_WITH_NPU)
       # Need to add HIAI runtime libs (libhiai.so) dependency
-      target_link_libraries(paddle_lite_jni ${npu_runtime_libs})
+      target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
     endif()
   endif()
...
@@ -139,22 +139,15 @@ std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
-  std::vector<const cpp::OpDesc *> feeds;
-  std::vector<const cpp::OpDesc *> fetchs;
-#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
-  // The shape of input tensors must be determined before generating NPU and XPU
-  // program.
-  auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  for (size_t i = 0; i < current_block->OpsSize(); i++) {
-    auto op = current_block->GetOp<cpp::OpDesc>(i);
-#else
   if (!program_) {
     GenRuntimeProgram();
   }
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
   const auto &insts = program_->instructions();
   for (size_t i = 0; i < program_->num_instructions(); i++) {
     const auto &op = insts[i].op()->op_info();
-#endif
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
...
@@ -90,6 +90,10 @@ std::vector<Place> ParserValidPlaces() {
         TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
   } else if (target_repr == "x86") {
     valid_places.emplace_back(TARGET(kX86));
+  } else if (target_repr == "npu") {
+    valid_places.emplace_back(TARGET(kNPU));
+  } else if (target_repr == "xpu") {
+    valid_places.emplace_back(TARGET(kXPU));
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
...
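For reference, selecting the new "npu"/"xpu" targets amounts to building the same Place list an API user constructs by hand; a sketch using the lite_api calls exercised by the subgraph pass test later in this commit:

    // Sketch: prefer the NPU place and fall back to the CPU place.
    std::vector<lite_api::Place> valid_places{
        lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
        lite_api::Place{TARGET(kARM), PRECISION(kFloat)}};
    cxx_config.set_valid_places(valid_places);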
@@ -20,12 +20,6 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-#ifdef LITE_WITH_NPU
-USE_MIR_PASS(generate_npu_program_pass);
-#endif
-#ifdef LITE_WITH_XPU
-USE_MIR_PASS(generate_xpu_program_pass);
-#endif
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
@@ -45,3 +39,5 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(npu_subgraph_pass);
+USE_MIR_PASS(xpu_subgraph_pass);
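Note: the registrations behind these two pass names are not part of this excerpt; presumably they follow the same REGISTER_MIR_PASS pattern as the generate_*_program_pass registrations removed below, roughly (the pass class name here is hypothetical):

    // Hypothetical sketch; the real class lives in subgraph_pass.cc, which
    // this excerpt omits.
    REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
        .BindTargets({TARGET(kNPU)});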
@@ -2,5 +2,4 @@ if(NOT LITE_WITH_NPU)
   return()
 endif()
-lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs})
-lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope)
+lite_cc_library(device_npu SRCS device.cc DEPS ${npu_builder_libs} ${npu_runtime_libs})
@@ -12,47 +12,56 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "lite/backends/npu/runtime.h"
-#include <string>
-#include <vector>
+#include "lite/backends/npu/device.h"
 #include "lite/utils/cp_logging.h"
 namespace paddle {
 namespace lite {
 namespace npu {
-// Create hiai model manager to load om model from lite tensor, and return the
-// manager and an unique model name
-bool LoadModel(const lite::Tensor &model_data,
-               std::shared_ptr<hiai::AiModelMngerClient> *model_client,
-               std::string *model_name) {
-  LOG(INFO) << "[NPU] Load model.";
-  auto model_data_ptr = model_data.data<int8_t>();
-  auto model_data_size = model_data.numel() * sizeof(int8_t);
-  if (model_data_ptr == nullptr || model_data_size == 0) {
-    return false;
+std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
+    std::string& model_name,                 // NOLINT
+    std::vector<ge::Operator>& input_nodes,  // NOLINT
+    std::vector<ge::Operator>& output_nodes  // NOLINT
+    ) {
+  VLOG(3) << "[NPU] Build model";
+  // Build the HiAI IR graph to the HiAI om model
+  ge::Graph ir_graph("graph");
+  ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
+  ge::Model om_model("model", "model");
+  om_model.SetGraph(ir_graph);
+  domi::HiaiIrBuild ir_build;
+  domi::ModelBufferData om_model_buf;
+  if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
+    LOG(WARNING) << "[NPU] CreateModelBuff failed!";
+    return nullptr;
   }
-  *model_client = std::make_shared<hiai::AiModelMngerClient>();
-  int ret = (*model_client)->Init(nullptr);
-  if (ret != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!";
-    return false;
+  if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
+    LOG(WARNING) << "[NPU] BuildIRModel failed!";
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
   }
-  *model_name = "model.om";
+  // Create a HiAI model manager client to load the HiAI om model
+  std::unique_ptr<hiai::AiModelMngerClient> model_client(
+      new hiai::AiModelMngerClient());
+  if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
+    LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
+  }
+  model_name = "model_" + std::to_string(model_count_++) + ".om";
   auto model_desc = std::make_shared<hiai::AiModelDescription>(
-      *model_name,
-      DeviceInfo::Global().freq_level(),
-      DeviceInfo::Global().framework_type(),
-      DeviceInfo::Global().model_type(),
-      DeviceInfo::Global().device_type());
-  model_desc->SetModelBuffer(model_data_ptr, model_data_size);
+      model_name, freq_level(), framework_type(), model_type(), device_type());
+  model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
   std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs;
   model_descs.push_back(model_desc);
-  if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) {
+  if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
     LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
-    return false;
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
   }
-  return true;
+  ir_build.ReleaseModelBuff(om_model_buf);
+  return model_client;
 }
 }  // namespace npu
...
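With this change the om model is built and loaded in a single call at execution time, instead of being serialized into a weight tensor during optimization. A hypothetical caller sketch (the subgraph kernel itself is not part of this excerpt):

    std::string model_name;
    std::vector<ge::Operator> device_inodes;  // filled by the NPU op bridges
    std::vector<ge::Operator> device_onodes;  // filled by the NPU op bridges
    auto model_client = paddle::lite::npu::Device::Global().Build(
        model_name, device_inodes, device_onodes);
    if (model_client == nullptr) {
      LOG(WARNING) << "[NPU] building the device program failed";
    }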
@@ -13,38 +13,47 @@
 // limitations under the License.
 #pragma once
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "lite/core/tensor.h"
+#include "ai_ddk_lib/include/hiai_ir_build.h"
 namespace paddle {
 namespace lite {
 namespace npu {
-class DeviceInfo {
+class Device {
  public:
-  static DeviceInfo &Global() {
-    static DeviceInfo x;
+  static Device& Global() {
+    static Device x;
     return x;
   }
-  DeviceInfo() {}
+  Device() {}
   int freq_level() { return freq_level_; }
   int framework_type() { return framework_type_; }
   int model_type() { return model_type_; }
   int device_type() { return device_type_; }
+  // Build the HiAI IR graph to om model, return HiAI model manager client to
+  // load om model and run inference.
+  std::unique_ptr<hiai::AiModelMngerClient> Build(
+      std::string& model_name,                 // NOLINT
+      std::vector<ge::Operator>& input_nodes,  // NOLINT
+      std::vector<ge::Operator>& output_nodes  // NOLINT
+      );  // NOLINT
  private:
   int freq_level_{3};
   int framework_type_{0};
   int model_type_{0};
   int device_type_{0};
+  int model_count_{0};
 };
-bool LoadModel(const lite::Tensor &model_data,
-               std::shared_ptr<hiai::AiModelMngerClient> *model_client,
-               std::string *model_name);
 }  // namespace npu
 }  // namespace lite
 }  // namespace paddle
@@ -2,5 +2,4 @@ if(NOT LITE_WITH_XPU)
   return()
 endif()
-lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs})
-lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope)
+lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
@@ -12,33 +12,31 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "lite/backends/xpu/runtime.h"
-#include <vector>
+#include "lite/backends/xpu/device.h"
 #include "lite/utils/cp_logging.h"
 namespace paddle {
 namespace lite {
 namespace xpu {
-// Extract the model data and recover the XPU model for inference, the function
-// is called by the graph computing kernel when the graph op is executed.
-// Due to the lack of XPU APIs for loading and recovering the XPU model from
-// memory, the key name is obtained from the weight tensor of graph op, to get
-// the runtime object for inference from the global variable 'DeviceInfo'.
-// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op.
-bool LoadModel(const lite::Tensor &model,
-               std::shared_ptr<xtcl::network::xRuntimeInstance> *runtime) {
-  LOG(INFO) << "[XPU] Load Model.";
-  CHECK_GT(model.dims().production(), 0);
-  std::string name(reinterpret_cast<const char *>(model.data<int8_t>()));
-  LOG(INFO) << "[XPU] Model Name: " << name;
-  CHECK(runtime != nullptr);
-  *runtime = DeviceInfo::Global().Find(name);
-  if (*runtime == nullptr) {
-    LOG(WARNING) << "[XPU] Load Model failed!";
-    return false;
-  }
-  return true;
+std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
+    xtcl::network::xNetworkBuilder* builder,
+    xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+    std::vector<xtcl::xExpr*>* outputs) {
+  VLOG(3) << "[XPU] Build model";
+  CHECK(builder != nullptr);
+  CHECK(outputs != nullptr);
+  CHECK_GT(outputs->size(), 0);
+  // The XPU compiler build the graph and fill all of the constant params, only
+  // one output is supported now.
+  xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0]));
+  auto target = xtcl::Target::Create(device_name_);
+  auto compiler = xtcl::network::xTensorCompiler(network, target);
+  compiler.SetParams(*params);  // Set the data of constant tensors
+  compiler.Build();
+  return std::unique_ptr<xtcl::network::xRuntimeInstance>(
+      new xtcl::network::xRuntimeInstance(compiler.CreateRuntimeInstance()));
 }
 }  // namespace xpu
...
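As in the NPU case above, the XPU runtime is now compiled directly from the live XTCL builder rather than recovered from a serialized weight tensor. A hypothetical caller sketch (types follow the Build() signature in the header below):

    xtcl::network::xNetworkBuilder builder;
    xtcl::network::xTensorCompiler::ParamNDArrayMap params;
    std::vector<xtcl::xExpr*> output_nodes;  // filled by the XPU op bridges
    auto runtime = paddle::lite::xpu::Device::Global().Build(
        &builder, &params, &output_nodes);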
@@ -17,31 +17,34 @@
 #include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/types.h"
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace xpu {
-class GraphCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+class Device {
  public:
-  using param_t = operators::GraphParam;
-  void PrepareForRun() override;
-  void Run() override;
-  virtual ~GraphCompute() = default;
+  static Device& Global() {
+    static Device x;
+    return x;
+  }
+  Device() {}
+  // Build the XPU graph to the XPU runtime, return the XPU runtime which can be
+  // used to run inference.
+  std::unique_ptr<xtcl::network::xRuntimeInstance> Build(
+      xtcl::network::xNetworkBuilder* builder,
+      xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+      std::vector<xtcl::xExpr*>* outputs);
  private:
-  std::shared_ptr<xtcl::network::xRuntimeInstance> runtime_{nullptr};
+  // Keep reserved fields
+  int device_id_{0};
+  std::string device_name_{"llvm"};
 };
 }  // namespace xpu
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
@@ -33,9 +33,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor)
 lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
 if (LITE_WITH_ARM)
-  lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime)
+  lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags)
 else()
-  lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime)
+  lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags)
 endif()
 #-------------------------------------------- GET CODE META INFO ------------------------------------------
...
@@ -5,6 +5,6 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
@@ -14,13 +14,38 @@
 #include "lite/core/arena/framework.h"
 #include "lite/core/context.h"
+#include "lite/operators/subgraph_op.h"
 namespace paddle {
 namespace lite {
 namespace arena {
 void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
+  std::shared_ptr<lite::OpLite> op = nullptr;
+  if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
+    // Create a new block desc to wrap the original op desc
+    int sub_block_idx = 0;
+    auto sub_block_desc = new cpp::BlockDesc();
+    sub_block_desc->ClearOps();
+    sub_block_desc->ClearVars();
+    auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
+    *sub_block_op_desc = *op_desc_;
+    // Add the block desc into the subgraph op which used to replace the
+    // original op
+    op_desc_.reset(new cpp::OpDesc());
+    op_desc_->SetType("subgraph");
+    op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
+    op_desc_->SetInput("Inputs", op_desc_->input_vars());
+    op_desc_->SetOutput("Outputs", op_desc_->output_vars());
+    op_desc_->SetAttr<std::vector<std::string>>(
+        "input_data_names", sub_block_op_desc->input_vars());
+    op_desc_->SetAttr<std::vector<std::string>>(
+        "output_data_names", sub_block_op_desc->output_vars());
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+    static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
+  } else {
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+  }
   CHECK(op) << "no op for " << op_desc().Type();
   op->Attach(*op_desc_, inst_scope_);
   auto kernels = op->CreateKernels({place_});
@@ -68,6 +93,19 @@ void TestCase::PrepareInputsForInstruction() {
   }
 }
+TestCase::~TestCase() {
+  if (op_desc_->Type() == "subgraph") {
+    // Release the subblock desc of Subgraph op
+    auto subgraph_op = const_cast<operators::SubgraphOp*>(
+        static_cast<const operators::SubgraphOp*>(instruction_->op()));
+    CHECK(subgraph_op);
+    auto sub_block_desc = subgraph_op->GetSubBlock();
+    if (sub_block_desc) {
+      delete sub_block_desc;
+    }
+  }
+}
 }  // namespace arena
 }  // namespace lite
 }  // namespace paddle
@@ -42,7 +42,7 @@ class TestCase {
       : place_(place), scope_(new Scope), alias_(alias) {
     ctx_ = ContextScheduler::Global().NewContext(place_.target);
   }
-  virtual ~TestCase() {}
+  virtual ~TestCase();
   void Prepare() {
     PrepareScopes();
...
@@ -25,12 +25,6 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
-#ifdef LITE_WITH_NPU
-#include "lite/backends/npu/runtime.h"
-#endif
-#ifdef LITE_WITH_XPU
-#include "lite/backends/xpu/runtime.h"
-#endif
 #include <map>
 #include <memory>
@@ -93,7 +87,7 @@ template <>
 class Context<TargetType::kXPU> {
  public:
   Context() {}
-  explicit Context(const NPUContext& ctx);
+  explicit Context(const XPUContext& ctx);
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
   void CopySharedTo(XPUContext* ctx) {}
...
@@ -32,7 +32,7 @@ lite_cc_library(mir_passes
     demo_pass.cc
     runtime_context_assign_pass.cc
     memory_optimize_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
+    DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
 # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
 #mir_ssa_graph scope op
...
@@ -36,15 +36,6 @@ std::string Visualize(mir::SSAGraph* graph) {
   int id = 0;
   std::set<std::string> exists_args;
-  std::map<int, std::string> graph_col;  // Different colors of subgraphs
-  graph_col.insert({{1, "red"},
-                    {2, "green"},
-                    {3, "cyan"},
-                    {4, "bisque3"},
-                    {5, "coral"},
-                    {6, "darkseagreen1"},
-                    {7, "goldenrod1"},
-                    {8, "darkorchid"}});
   for (auto& node : graph->mutable_nodes()) {
     std::string key;
     if (node.IsArg()) {
@@ -52,24 +43,12 @@ std::string Visualize(mir::SSAGraph* graph) {
     } else {
       key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++);
     }
     if (node.IsStmt()) {
-      auto& stmt = node.AsStmt();
-      auto sub_id = stmt.subgraph_id();
-      auto it = graph_col.find(sub_id);
-      if (sub_id > 0 && it != graph_col.end()) {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", it->second)});
-      } else {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", "yellow")});
-      }
+      dot.AddNode(key,
+                  {Dot::Attr("shape", "box"),
+                   Dot::Attr("style", "filled"),
+                   Dot::Attr("color", "black"),
+                   Dot::Attr("fillcolor", "yellow")});
       for (auto& x : node.inlinks) {
         auto name = x->AsArg().name;
         if (!exists_args.count(name)) {
...
@@ -50,7 +50,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
       "lod_reset",
       "concat",
       "yolo_box",
-      "graph_op",
+      "subgraph",
       "feed",
       "fetch"};
   for (auto* tmp : node->inlinks) {
...
@@ -64,9 +64,6 @@ class Node {
       return valid_kernels_;
     }
-    void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; }
-    void SetSubgraphID(int id) { subgraph_id_ = id; }
-    int subgraph_id() const { return subgraph_id_; }
     void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
     const std::shared_ptr<OpLite> op() const { return op_; }
@@ -82,11 +79,6 @@ class Node {
     // Description.
     std::string desc;
-   protected:
-    // -1 means not in subgraph, 0 means supported but not one id, id started
-    // from 1
-    int subgraph_id_{-1};
   };
   struct Arg {
...
+lite_cc_library(subgraph_detector
+    SRCS subgraph_detector.cc
+    DEPS mir_pass types subgraph_op)
 lite_cc_library(subgraph_pass
-    SRCS subgraph_program_pass.cc
-    DEPS mir_pass types ${mir_fusers})
-lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc
-  DEPS subgraph_pass mir_passes gflags model_parser cxx_api
-  ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
+    SRCS subgraph_pass.cc
+    DEPS mir_pass types context ${mir_fusers} subgraph_detector)
 if (WITH_TESTING)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-  set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-set(subgraph_passes subgraph_pass)
-if(LITE_WITH_NPU)
-  lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass)
-  list(APPEND subgraph_passes npu_pass)
-  lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc
-    DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags
-    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
-         --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL)
-  if (WITH_TESTING)
-    add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz)
-    add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-    set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-    set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  endif()
-endif()
-if(LITE_WITH_XPU)
-  lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass)
-  list(APPEND subgraph_passes xpu_pass)
-  lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc
-    DEPS xpu_pass mir_passes paddle_api_full gflags
-    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
-         --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL)
-  if (WITH_TESTING)
-    add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz)
-    add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-    set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-    set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  endif()
+  lite_cc_test(test_subgraph_detector
+    SRCS subgraph_detector_test.cc
+    DEPS subgraph_detector mir_passes gflags model_parser cxx_api
+    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
+  add_dependencies(test_subgraph_detector
+    extern_lite_download_mobilenet_v1_tar_gz
+    extern_lite_download_mobilenet_v2_relu_tar_gz)
+  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
+  set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  lite_cc_test(test_subgraph_pass
+    SRCS subgraph_pass_test.cc
+    DEPS mir_passes paddle_api_full paddle_api_light gflags
+    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
+         --optimized_model_dir=${LITE_MODEL_DIR}/lite_model_opt SERIAL)
+  add_dependencies(test_subgraph_pass
+    extern_lite_download_mobilenet_v1_tar_gz
+    extern_lite_download_mobilenet_v2_relu_tar_gz)
+  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
+  set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 endif()
-set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes")
-message(STATUS "----> subgraph_passes: ${subgraph_passes}")
+set(mir_subgraphs subgraph_pass CACHE INTERNAL "mir_subgraphs")
+message(STATUS "----> mir_subgraphs: ${mir_subgraphs}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/generate_npu_program_pass.h"
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/backends/npu/builder.h"
#include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
lite::mir::Node* var_node, const Scope* scope) {
CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg();
VLOG(4) << "[NPU] Convert var node " << arg.name;
auto* var = scope->FindVar(arg.name);
CHECK(var);
auto* tensor = var->GetMutable<lite::Tensor>();
CHECK(tensor);
auto dims = tensor->dims();
if (arg.is_weight) {
auto wgt = std::make_shared<ge::op::Const>(arg.name);
LOG(INFO) << "[NPU] Convert const var node " << arg.name;
VLOG(4) << dims;
wgt->set_attr_value(lite::npu::CvtTensor(tensor));
return wgt;
} else {
CHECK_EQ(dims.size(), 4);
LOG(INFO) << "[NPU] Convert data var node " << arg.name;
LOG(INFO) << dims;
// TODO(xxx): support more types and dims size
ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
ge::Format::FORMAT_NCHW,
ge::DataType::DT_FLOAT);
// auto size = desc.GetShape().GetShapeSize();
// ge::TensorUtils::SetSize(desc, size*sizeof(float));
// ge::TensorUtils::SetRealDimCnt(desc, 4);
auto data = std::make_shared<ge::op::Data>(arg.name);
data->update_input_desc_x(desc);
return data;
}
return nullptr;
}
void GenerateNPUProgramPass::CvtAllOpNodes(
const std::vector<Node*>& nodes2cvt,
lite::kernels::npu::bridges::node_map_type* converted_vars) {
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& cvtfunc_map = bridges.AllFunctions();
// return record all converted vars
// op node's inputs must be found in converted_vars
for (auto& node : nodes2cvt) {
lite::kernels::npu::bridges::node_map_type node_inputs;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
// weight should be handled in the converter, so skip here
if (arg.is_weight) {
continue;
}
auto var_name = arg.name;
if (!converted_vars->count(var_name)) {
converted_vars->insert(
std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope())));
}
node_inputs.insert(*converted_vars->find(var_name));
}
auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
converted_vars->insert(node_outputs.begin(), node_outputs.end());
}
}
std::string GenerateNPUProgramPass::BuildNPUGraph(
const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id) {
auto ordered_nodes = GetTopologicalOrder(op_nodes);
lite::kernels::npu::bridges::node_map_type converted_vars;
CvtAllOpNodes(ordered_nodes, &converted_vars);
std::vector<std::string> in_var_names;
std::vector<std::string> out_var_names;
std::vector<ge::Operator> inputs;
std::vector<ge::Operator> outputs;
for (auto i : in_data_vars) {
auto argname = i->AsArg().name;
in_var_names.push_back(argname);
inputs.push_back(*converted_vars.at(argname));
}
for (auto i : out_data_vars) {
auto argname = i->AsArg().name;
out_var_names.push_back(argname);
outputs.push_back(*converted_vars.at(argname));
}
std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights";
auto any_op = (*op_nodes.begin())->AsStmt().op();
auto weight = any_op->scope()->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8));
  // Compile the IR graph to the NPU model and store the model data into the
  // weight tensor with persistable=true, so that the model parser can
  // recognize it and save it to the param files
if (!lite::npu::BuildModel(inputs, outputs, weight)) {
LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")";
} else {
LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")";
}
return weight_var_name;
}
void GenerateNPUProgramPass::GenNPUSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id) {
std::unordered_set<Node*> in_data_vars;
std::unordered_set<Node*> in_wgt_vars;
std::unordered_set<Node*> out_data_vars;
std::unordered_set<Node*> out_unused_vars;
FindInputOutputVars(
op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
auto weight_var_name =
BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
auto any_op = (*op_nodes.begin())->AsStmt().op();
InsertNewNode(graph,
weight_var_name,
any_op->scope(),
any_op->valid_places(),
in_data_vars,
in_wgt_vars,
out_data_vars,
out_unused_vars);
auto nodes2rm = GetNode2rm(
op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
GraphSafeRemoveNodes(graph.get(), nodes2rm);
}
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
for (auto& i : op_map) {
LOG(INFO) << "[NPU] Supported type: " << i.first;
supported_op_types.push_back(i.first);
}
int num_subgraph = FuseSubgraph(graph, supported_op_types);
InferOnce(graph);
auto op_nodes_all = ClassifySubgraph(graph);
CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1;
for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "[NPU] Converting Subgraph " << id;
GenNPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get());
id++;
}
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(generate_npu_program_pass,
paddle::lite::mir::subgraph::GenerateNPUProgramPass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::shared_ptr<xtcl::xExpr> GenerateXPUProgramPass::CvtVarNode(
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::mir::Node* var_node,
const Scope* scope) {
CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg();
auto var_name = arg.name;
VLOG(4) << "[XPU] Convert var node " << var_name;
auto* var = scope->FindVar(var_name);
CHECK(var);
auto* tensor = var->GetMutable<lite::Tensor>();
CHECK(tensor);
auto dims = tensor->dims();
auto cvted_var_node =
std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32)));
if (arg.is_weight) {
auto cvted_var_tensor = lite::xpu::CvtTensor(tensor);
graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor));
}
return cvted_var_node;
}
void GenerateXPUProgramPass::CvtAllOpNodes(
const std::vector<Node*>& op_nodes,
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) {
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& supported_lists = bridges.AllFunctions();
// return record all converted vars
// op node's inputs must be found in converted_vars
for (auto& node : op_nodes) {
lite::kernels::xpu::bridges::node_map_type input_nodes;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
// weight should be handled in the converter, so skip here
if (arg.is_weight) {
continue;
}
auto var_name = arg.name;
if (!cvted_var_nodes->count(var_name)) {
cvted_var_nodes->insert(std::make_pair(
var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope())));
}
input_nodes.insert(*cvted_var_nodes->find(var_name));
}
auto output_nodes =
supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes);
cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end());
}
}
std::string GenerateXPUProgramPass::BuildXPUGraph(
const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id) {
auto ordered_op_nodes = GetTopologicalOrder(op_nodes);
lite::kernels::xpu::bridges::graph_ctx_type graph_ctx;
graph_ctx.builder = std::make_shared<xtcl::network::xNetworkBuilder>();
graph_ctx.params =
std::make_shared<xtcl::network::xTensorCompiler::ParamNDArrayMap>();
lite::kernels::xpu::bridges::node_map_type cvted_var_nodes;
CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes);
std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights";
auto any_op = (*op_nodes.begin())->AsStmt().op();
auto weight = any_op->scope()->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8));
  // Compile the graph to the XPU model and store the model data into the
  // weight tensor with persistable=true, so that the model parser can
  // recognize it and save it to the param files
std::vector<std::shared_ptr<xtcl::xExpr>> ordered_cvted_var_nodes;
for (auto out_data_var : out_data_vars) {
auto var_name = out_data_var->AsArg().name;
ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]);
}
if (!lite::xpu::BuildModel(graph_ctx.builder,
graph_ctx.params,
&ordered_cvted_var_nodes,
weight)) {
LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")";
} else {
LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")";
}
return weight_var_name;
}
void GenerateXPUProgramPass::GenXPUSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id) {
std::unordered_set<Node*> in_data_vars;
std::unordered_set<Node*> in_wgt_vars;
std::unordered_set<Node*> out_data_vars;
std::unordered_set<Node*> out_unused_vars;
FindInputOutputVars(
op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
auto weight_var_name =
BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
auto any_op = (*op_nodes.begin())->AsStmt().op();
InsertNewNode(graph,
weight_var_name,
any_op->scope(),
any_op->valid_places(),
in_data_vars,
in_wgt_vars,
out_data_vars,
out_unused_vars);
auto nodes2rm = GetNode2rm(
op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
GraphSafeRemoveNodes(graph.get(), nodes2rm);
}
void GenerateXPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
for (auto& i : op_map) {
LOG(INFO) << "[XPU] Supported type: " << i.first;
supported_op_types.push_back(i.first);
}
int num_subgraph = FuseSubgraph(graph, supported_op_types);
InferOnce(graph);
auto op_nodes_all = ClassifySubgraph(graph);
CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1;
for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "[XPU] Converting Subgraph " << id;
GenXPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get());
id++;
}
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(generate_xpu_program_pass,
paddle::lite::mir::subgraph::GenerateXPUProgramPass)
.BindTargets({TARGET(kXPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model");
DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors");
DEFINE_int32(output_tensor_num, 1, "number of output tensors");
namespace paddle {
namespace lite {
std::vector<std::vector<int64_t>> ParseShape(std::string txt) {
std::vector<std::vector<int64_t>> shape;
while (!txt.empty()) {
size_t idx = txt.find_first_of(":");
std::string dims = txt.substr(0, idx);
std::vector<int64_t> s;
while (!dims.empty()) {
size_t idx = dims.find_first_of(",");
int d = atoi(dims.substr(0, idx).c_str());
VLOG(3) << d;
s.push_back(d);
if (idx == std::string::npos) {
break;
} else {
dims = dims.substr(idx + 1);
}
}
shape.push_back(s);
if (idx == std::string::npos) {
break;
} else {
txt = txt.substr(idx + 1);
}
}
return shape;
}
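// For example, given the accepted formats ("D,D,..." groups joined by ":"):
//   ParseShape("1,3,224,224")      returns {{1, 3, 224, 224}}
//   ParseShape("1,3,224,224:1,80") returns {{1, 3, 224, 224}, {1, 80}}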
int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
void FillInputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const float value) {
for (int i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_data = input_tensor->mutable_data<float>();
auto input_tensor_size = ShapeProduction(input_tensor->shape());
for (int j = 0; j < input_tensor_size; j++) {
input_tensor_data[j] = value;
}
}
}
void CompareOutputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
const int output_tensor_num) {
for (int i = 0; i < output_tensor_num; i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_data = tar_output_tensor->data<float>();
auto ref_output_tensor_data = ref_output_tensor->data<float>();
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape());
EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size);
for (size_t j = 0; j < ref_output_tensor_size; j++) {
auto diff =
std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) /
(std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << diff;
EXPECT_LT(diff, 0.1);
}
}
}
std::shared_ptr<lite_api::PaddlePredictor> TestModel(
const std::string& model_dir,
const std::string& model_file,
const std::string& params_file,
const std::vector<lite_api::Place>& valid_places,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const std::string& optimized_model_dir) {
// generate optimized model
lite_api::CxxConfig cxx_config;
cxx_config.set_model_dir(model_dir);
cxx_config.set_model_file(model_file);
cxx_config.set_param_file(params_file);
cxx_config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
FillInputTensor(predictor, input_tensor_shape, -1);
predictor->SaveOptimizedModel(optimized_model_dir,
lite_api::LiteModelType::kNaiveBuffer);
#if 0 // TODO(hong19860320) supports light api for XPU
// load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensor(predictor, input_tensor_shape, 1);
#endif
// run optimized model
for (int i = 0; i < FLAGS_warmup; i++) {
predictor->Run();
}
for (int i = 0; i < FLAGS_repeats; i++) {
auto start = GetCurrentUS();
predictor->Run();
LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
}
return predictor;
}
TEST(XPUSubgraph, compare) {
// parsing input tensor shape, supported formats: "1,3,224,224"
// "1,3,224,224:1,80"
std::vector<std::vector<int64_t>> input_tensor_shape =
ParseShape(FLAGS_input_tensor_shape);
// generate and run optimized CPU model
LOG(INFO) << " ================ CPU ================== ";
auto cpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/CPU");
// generate and run optimized XPU model
LOG(INFO) << " ================ XPU ================== ";
auto xpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/XPU");
// verify results
CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/subgraph_detector.h"
#include <memory>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
using inference::analysis::Dot;
std::string SubgraphVisualizer::operator()() {
inference::analysis::Dot dot;
const std::vector<std::string> subgraph_colors{
"red", "green", "cyan", "bisque3",
"coral", "darkseagreen1", "goldenrod1", "darkorchid",
"antiquewhite", "aquamarine", "azure", "bisque4",
"blue2", "brown1", "burlywood1", "cadetblue1",
"chartreuse1", "chocolate1", "coral1", "cornsilk",
"crimson", "cyan4", "darkgoldenrod4", "darkolivegreen2",
"darkorange2", "darkorchid2", "darkseagreen3", "darkslategray",
"deeppink2", "deepskyblue2", "dodgerblue", "firebrick",
"floralwhite", "gold1", "skyblue3", "indianred",
"indigo", "lavenderblush2", "lightblue1", "lightsalmon3",
"khaki1", "ivory4", "sandybrown", "olivedrab2",
"turquoise4", "snow3", "sienna4", "salmon2",
};
std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i;
}
}
std::unordered_map<std::string, int> exists_ops;
std::set<std::string> exists_args;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) {
continue;
}
auto op_type = node->AsStmt().op_type();
if (!exists_ops.count(op_type)) {
exists_ops[op_type] = 0;
} else {
exists_ops[op_type]++;
}
auto op_name = op_type + std::to_string(exists_ops[op_type]);
std::string op_color = "white";
if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
}
dot.AddNode(op_name,
{Dot::Attr("shape", "box"),
Dot::Attr("style", "filled"),
Dot::Attr("color", "black"),
Dot::Attr("fillcolor", op_color)});
for (auto &in_node : node->inlinks) {
auto arg_name = in_node->AsArg().name;
if (!exists_args.count(arg_name)) {
dot.AddNode(arg_name, {});
exists_args.insert(arg_name);
}
dot.AddEdge(arg_name, op_name, {});
}
for (auto &out_node : node->outlinks) {
auto arg_name = out_node->AsArg().name;
if (!exists_args.count(arg_name)) {
dot.AddNode(arg_name, {});
exists_args.insert(arg_name);
}
dot.AddEdge(op_name, arg_name, {});
}
}
auto res = dot.Build();
VLOG(3) << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl;
return res;
}
// Find the root ancestor of this node's union-find set
SubgraphDetector::node_dat_t *
SubgraphDetector::node_dat_t::UnionFindAncestor() {
node_dat_t *ancestor = this;
while (ancestor->union_find_parent != ancestor) {
ancestor = ancestor->union_find_parent;
}
return ancestor;
}
// Merge two adjacent nodes into one node.
// Suppose we have two adjacent nodes src and dst.
// We will perform the following operations:
// 1. add all inputs (except src) of dst to src's inlinks.
// 2. add all outputs of dst to src's outlinks.
// 3. redirect the corresponding inlinks and outlinks of dst's inputs and
//    outputs to the src node.
// 4. delete all of dst's inlinks and outlinks.
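// For example, merging src -> dst in the chain a -> src -> dst -> b leaves
// the combined node (rooted at src) with inlinks {a} and outlinks {b}, and
// b's inlink is redirected from dst to src.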
void SubgraphDetector::node_dat_t::UnionFindCombine(node_dat_t *candidate) {
  // Make these two nodes share the same ancestor.
union_find_parent = UnionFindAncestor();
node_dat_t *candidate_ancestor = candidate->UnionFindAncestor();
candidate_ancestor->union_find_parent = union_find_parent;
candidate->union_find_parent = union_find_parent;
// Obtain the input and output nodes for the combined one
std::unordered_set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
std::unordered_set<node_dat_t *> outputs(candidate->outlinks.begin(),
candidate->outlinks.end());
for (auto *out_node : outlinks) {
if (out_node != candidate) {
outputs.insert(out_node);
}
}
for (auto *in_node : candidate->inlinks) {
if (in_node != this) {
inputs.insert(in_node);
}
}
// Update the dst and src node's inlinks and outlinks.
#ifdef __clang__
inlinks = node_set_t(inputs.begin(), inputs.end());
outlinks = node_set_t(outputs.begin(), outputs.end());
candidate->inlinks.clear();
candidate->outlinks.clear();
#else
inlinks = std::move(node_set_t(inputs.begin(), inputs.end()));
outlinks = std::move(node_set_t(outputs.begin(), outputs.end()));
candidate->inlinks.clear();
candidate->outlinks.clear();
#endif
  // Redirect the inlinks and outlinks of dst's inputs and outputs to the
  // src node.
for (auto *in_node : inlinks) {
for (auto *&out_node : in_node->outlinks) {
if (out_node == candidate) {
out_node = this;
}
}
}
for (auto *out_node : outlinks) {
for (auto *&in_node : out_node->inlinks) {
if (in_node == candidate) {
in_node = this;
}
}
}
}
// FlexibleDFS
// If reverse is true, do a reverse DFS (walk inlinks instead of outlinks).
// If the enter func is not nullptr, calls enter(node) before visiting any
// children of node.
// If the leave func is not nullptr, calls leave(node) after visiting all
// children of node.
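// For example, ExtractSubgraphs() uses it to test whether `node` is
// reachable from `source_nodes` against the edge direction:
//   bool found = false;
//   FlexibleDFS(source_nodes, /*reverse=*/true, /*enter=*/nullptr,
//               /*leave=*/[&found, node](const node_dat_t *n) {
//                 if (n == node) {
//                   found = true;
//                   return false;  // abort the traversal early
//                 }
//                 return true;
//               });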
void SubgraphDetector::FlexibleDFS(
const node_set_t &source,
bool reverse,
const std::function<bool(const node_dat_t *)> &enter,
const std::function<bool(const node_dat_t *)> &leave) {
std::vector<std::pair<const node_dat_t *, bool>> stack; // node, leave
for (auto &node : source) {
stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
}
std::unordered_set<const node_dat_t *> visited;
while (!stack.empty()) {
auto top = stack.back();
stack.pop_back();
if (top.second) {
if (leave && !leave(top.first)) return;
}
if (visited.count(top.first)) continue;
visited.insert(top.first);
if (enter && !enter(top.first)) return;
if (leave)
stack.push_back(std::pair<const node_dat_t *, bool>(top.first, true));
const node_set_t iter_nodes =
        reverse ? top.first->inlinks : top.first->outlinks;
for (auto *node : iter_nodes) {
if (!visited.count(node)) {
stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
}
}
}
}
void SubgraphDetector::InitNodes(node_map_t *nodes) {
// Initialize and mark the subgraph detector nodes based on teller.
for (auto &it : *nodes) {
for (auto &in_node : it.first->inlinks) {
it.second->inlinks.push_back((*nodes)[in_node]);
}
for (auto &out_node : it.first->outlinks) {
it.second->outlinks.push_back((*nodes)[out_node]);
}
if (teller_(it.first)) {
it.second->marked = true;
if (it.first->IsStmt()) {
      // If a function is inside the subgraph, mark all of its output
      // variables as inside too, so that two marked functions end up in the
      // same subgraph. For example, given A_function->var->B_function, if
      // A_function is marked, var should also be marked, so that B_function
      // falls into the same subgraph as A_function when B_function is also
      // marked.
for (auto &out_node : it.first->outlinks) {
(*nodes)[out_node]->marked = true;
}
}
}
}
}
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) {
for (auto &it : *nodes) {
node_dat_t *node = it.second;
if (!node->marked) {
continue;
}
    // Our algorithm must guarantee that:
    // 1. The graph is always a directed acyclic graph (DAG).
    // 2. If there is a path in the subgraph from X to Y (X and Y are both
    //    nodes in the subgraph), then all paths from X to Y are in the
    //    subgraph.
    //
    // To achieve the above guarantees, for adjacent nodes src -> dst:
    // 1. Get all of dst's input nodes except src.
    // 2. Reverse DFS from those input nodes.
    // 3. If there is a path from the input nodes to src, then src and dst
    //    can not be fused into one node; otherwise they can.
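    // For example, with edges src -> dst, src -> M and M -> dst where M is
    // not marked, the reverse DFS from M (dst's other input) reaches src,
    // so fusing src and dst is rejected: the path src -> M -> dst would run
    // through a node outside the merged subgraph and create a cycle.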
while (true) {
std::unordered_set<node_dat_t *> contract_nodes;
for (auto *out_node : node->outlinks) {
        // must be a candidate
if (!out_node->marked) continue;
        // Get all of dst's input nodes except the src node.
node_set_t source_nodes;
for (auto *in_node : out_node->inlinks) {
if (in_node != node) {
source_nodes.push_back(in_node);
}
}
// Reverse DFS from the source_nodes.
bool have_excess_path = false;
FlexibleDFS(source_nodes,
true,
nullptr,
[&have_excess_path, node](const node_dat_t *n) {
if (n == node) {
have_excess_path = true;
return false;
}
return true;
});
if (have_excess_path) continue;
contract_nodes.insert(out_node);
}
if (contract_nodes.empty()) break;
for (auto &contract_node : contract_nodes) {
node->UnionFindCombine(contract_node);
}
}
}
std::unordered_map<node_dat_t * /*ancestor*/, std::vector<Node *>> clusters;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) continue;
if ((*nodes)[node]->marked) {
clusters[(*nodes)[node]->UnionFindAncestor()].push_back(node);
}
}
std::vector<std::vector<Node *>> subgraphs;
std::for_each(clusters.begin(),
clusters.end(),
[&](const decltype(clusters)::value_type &it) {
subgraphs.push_back(it.second);
});
return subgraphs;
}
std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
node_map_t nodes;
for (auto &node : graph_->mutable_nodes()) {
nodes[&node] = new node_dat_t(&node);
CHECK(nodes[&node]);
}
// Initialize and mark the subgraph detector nodes based on teller.
InitNodes(&nodes);
// Run the Extract algorithm to find all subgraphs.
std::vector<std::vector<Node *>> subgraphs = ExtractSubgraphs(&nodes);
for (auto &it : nodes) {
CHECK(it.second);
delete it.second;
}
return subgraphs;
}
void SubgraphFuser::InsertNewNode(SSAGraph *graph,
int subgraph_idx,
const std::vector<Node *> &subgraph_nodes) {
// Create and attach a new subgraph op
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
  // Create a new sub block desc for storing all of the Ops and Vars of the
  // target subgraph; sub_block_idx is set as an attribute of the subgraph
  // op, and sub_block_idx < 0 means it's a new subgraph op.
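  // e.g. the first subgraph (subgraph_idx == 0) gets sub_block_idx == -1;
  // the negative index is replaced with a real block index when the
  // optimized model is saved (see RuntimeProgram::SaveOpInfosToProgram).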
int sub_block_idx = -(subgraph_idx + 1);
auto sub_block_desc = new cpp::BlockDesc();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
for (auto &op_node : subgraph_nodes) {
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_node->AsStmt().op_info();
sub_block_op_desc->SetAttr(
kKernelTypeAttr,
op_node->AsStmt().picked_kernel().SerializedKernelType());
}
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
// Extract input and output nodes from the target subgraph
std::unordered_set<Node *> input_var_nodes;
std::unordered_set<Node *> weight_var_nodes;
std::unordered_set<Node *> output_var_nodes;
std::unordered_set<Node *> local_var_nodes;
std::unordered_set<Node *> unused_var_nodes;
ExtractInputsOutputs(subgraph_nodes,
&input_var_nodes,
&weight_var_nodes,
&output_var_nodes,
&local_var_nodes,
&unused_var_nodes);
  // Set the input and output name mappings, which store the real inputs and
  // outputs
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (auto &var_node : input_var_nodes) {
input_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : output_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
input_var_names);
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names);
  // Add all of the inputs and outputs to the target subgraph op, to prevent
  // the vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) {
input_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : local_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : unused_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
subgraph_op_desc.SetInput("Inputs", input_var_names);
subgraph_op_desc.SetOutput("Outputs", output_var_names);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp *>(subgraph_op.get())
->SetSubBlock(sub_block_desc);
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
// Create and add a new subgraph node into the graph
auto subgraph_op_node =
graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
for (auto &var_node : input_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node);
}
for (auto &var_node : weight_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node);
}
for (auto &var_node : output_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
for (auto &var_node : local_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
for (auto &var_node : unused_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
// Create and assign the context to the picked kernel of the new subgraph
// node
auto &inst = subgraph_op_node->AsStmt();
inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
// Remove subgraph nodes and unused var nodes
auto nodes2rm = GetNodes2RM(subgraph_nodes,
{input_var_nodes,
weight_var_nodes,
output_var_nodes,
local_var_nodes,
unused_var_nodes});
GraphSafeRemoveNodes(graph, nodes2rm);
}
void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
const SubgraphTeller &teller,
int min_subgraph_size) {
std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
}
}
}
void SubgraphFuser::operator()() {
ReplaceNodesWithSubgraphs(graph_, teller_, min_subgraph_size_);
}
void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
std::unordered_set<Node *> *input_var_nodes,
std::unordered_set<Node *> *weight_var_nodes,
std::unordered_set<Node *> *output_var_nodes,
std::unordered_set<Node *> *local_var_nodes,
std::unordered_set<Node *> *unused_var_nodes) {
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
if (var_node->AsArg().is_weight) {
weight_var_nodes->insert(var_node);
continue;
}
if (!var_node->inlinks.empty()) {
        // A var can only be produced by one op node, so use front()
auto *prev_op_node = var_node->inlinks.front();
if (std::find(op_nodes.begin(), op_nodes.end(), prev_op_node) !=
op_nodes.end()) {
continue;
}
}
input_var_nodes->insert(var_node);
}
for (auto &var_node : op_node->outlinks) {
if (var_node->outlinks.empty()) {
        // This var has no consumer op, so it is actually unused
unused_var_nodes->insert(var_node);
continue;
}
      // A var can have more than one consumer op node, so if any consumer
      // is inside op_nodes, treat the var as a local var and continue
bool next_op_in_nodes = false;
for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
op_nodes.end()) {
next_op_in_nodes = true;
}
}
if (next_op_in_nodes) {
local_var_nodes->insert(var_node);
continue;
}
output_var_nodes->insert(var_node);
}
}
}
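// Collect the op nodes of the subgraph together with all of their adjacent
// var nodes as removal candidates, then exclude the var nodes that are kept
// alive as the inputs and outputs of the new subgraph op.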
std::unordered_set<const Node *> GetNodes2RM(
const std::vector<Node *> &op_nodes,
const std::vector<std::unordered_set<Node *>> &excluded_var_nodes) {
std::unordered_set<const Node *> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
if (!nodes2rm.count(var_node)) {
nodes2rm.insert(var_node);
}
}
for (auto &var_node : op_node->outlinks) {
if (!nodes2rm.count(var_node)) {
nodes2rm.insert(var_node);
}
}
}
// Excluded nodes should not be removed
for (auto &excluded_var_node : excluded_var_nodes) {
for (auto &var_node : excluded_var_node) {
if (nodes2rm.count(var_node)) {
nodes2rm.erase(var_node);
}
}
}
return nodes2rm;
}
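// Post-order DFS helper: the producer op of each of the node's input vars is
// visited first, so a node is appended to ordered_nodes only after all of
// its predecessors within unordered_nodes.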
static void SortHelper(Node *node,
const std::unordered_set<Node *> &unordered_nodes,
std::unordered_set<const Node *> *visited_nodes,
std::vector<Node *> *ordered_nodes) {
for (auto &var_node : node->inlinks) {
if (var_node->inlinks.empty()) continue;
auto *op_node = var_node->inlinks.front();
if (unordered_nodes.count(op_node) && !visited_nodes->count(op_node)) {
SortHelper(op_node, unordered_nodes, visited_nodes, ordered_nodes);
}
}
ordered_nodes->push_back(node);
visited_nodes->insert(node);
}
std::vector<Node *> GetTopologicalOrder(
const std::unordered_set<Node *> &unordered_nodes) {
std::unordered_set<const Node *> visited_nodes;
std::vector<Node *> ordered_nodes;
for (auto &node : unordered_nodes) {
if (!node->IsStmt()) continue;
if (visited_nodes.count(node)) continue;
SortHelper(node, unordered_nodes, &visited_nodes, &ordered_nodes);
}
return ordered_nodes;
}
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
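// A teller returns true if the given op node is supported and may be placed
// inside a subgraph.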
using SubgraphTeller = std::function<bool(Node*)>;
class SubgraphVisualizer {
public:
SubgraphVisualizer(SSAGraph* graph,
const std::vector<std::vector<Node*>>& subgraphs)
: graph_(graph), subgraphs_(subgraphs) {}
std::string operator()();
protected:
SSAGraph* graph_{nullptr};
std::vector<std::vector<Node*>> subgraphs_;
};
/*
* Divide the graph into subgraphs according to the specified conditions.
 * Return the divided clusters; a cluster consists of the op nodes of a
 * subgraph.
*/
class SubgraphDetector {
public:
  // This is a simple representation of a graph. Each node_dat_t holds a
  // pointer to a Node, so the original graph is not changed during graph
  // analysis.
struct node_dat_t;
using node_map_t = std::unordered_map<Node*, node_dat_t*>;
using node_set_t = std::vector<node_dat_t*>;
struct node_dat_t {
explicit node_dat_t(Node* _node) : node(_node) {}
Node* node;
bool marked{false};
node_dat_t* union_find_parent{this};
node_set_t inlinks{};
node_set_t outlinks{};
node_dat_t* UnionFindAncestor();
void UnionFindCombine(node_dat_t* candidate);
};
SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller)
: graph_(graph), teller_(teller) {}
std::vector<std::vector<Node*>> operator()();
void FlexibleDFS(const node_set_t& source,
bool reverse,
const std::function<bool(const node_dat_t*)>& enter,
const std::function<bool(const node_dat_t*)>& leave);
void InitNodes(node_map_t* nodes);
std::vector<std::vector<Node*>> ExtractSubgraphs(node_map_t* nodes);
protected:
SSAGraph* graph_{nullptr};
SubgraphTeller teller_;
};
/*
 * Replace all subgraphs with subgraph ops. A block desc is added into each
 * subgraph op to wrap the original op nodes, and all var nodes of the
 * original op nodes are kept as the inputs and outputs of the subgraph op.
*/
class SubgraphFuser {
public:
SubgraphFuser(SSAGraph* graph,
const SubgraphTeller& teller,
int min_subgraph_size)
: graph_(graph), teller_(teller), min_subgraph_size_{min_subgraph_size} {}
void operator()();
  // Remove the op nodes of the subgraphs and replace them with the subgraph
  // ops.
void ReplaceNodesWithSubgraphs(SSAGraph* graph,
const SubgraphTeller& teller,
int min_subgraph_size);
// Create a subgraph node with a block desc to wrap the original op nodes of
// the subgraph
void InsertNewNode(SSAGraph* graph,
int subgraph_idx,
const std::vector<Node*>& subgraph_nodes);
protected:
SSAGraph* graph_{nullptr};
SubgraphTeller teller_;
int min_subgraph_size_;
};
void ExtractInputsOutputs(const std::vector<Node*>& op_nodes,
std::unordered_set<Node*>* input_var_nodes,
std::unordered_set<Node*>* weight_var_nodes,
std::unordered_set<Node*>* output_var_nodes,
std::unordered_set<Node*>* local_var_nodes,
std::unordered_set<Node*>* unused_var_nodes);
std::unordered_set<const Node*> GetNodes2RM(
const std::vector<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_var_nodes);
std::vector<Node*> GetTopologicalOrder(
const std::unordered_set<Node*>& unordered_nodes);
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -12,68 +12,25 @@ ...@@ -12,68 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/core/mir/subgraph/subgraph_program_pass.h" #include "lite/core/mir/subgraph/subgraph_detector.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <vector> #include <vector>
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/ssa_graph.h" #include "lite/core/mir/ssa_graph.h"
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/model_parser/cpp/program_desc.h" #include "lite/model_parser/cpp/program_desc.h"
#include "lite/model_parser/model_parser.h" #include "lite/model_parser/model_parser.h"
DEFINE_string(model_dir, "", "model_dir"); DEFINE_string(model_dir, "", "model_dir");
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
TEST(SubgraphTest, models) { // The helper functions for building model manually
cpp::ProgramDesc program_desc;
auto scope = std::make_shared<Scope>();
// LoadModelPb(FLAGS_model_dir,
// FLAGS_model_dir + "/model",
// FLAGS_model_dir + "/params",
// scope.get(),
// &program_desc,
// true);
LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc);
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
#ifdef LITE_WITH_ARM
Place{TARGET(kARM), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
});
lite::Program program(program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places);
std::vector<std::string> supported_op_types{"concat",
"conv2d",
"depthwise_conv2d",
"batch_norm",
"scale",
"pool2d",
"mul",
"elementwise_add",
"softmax",
"split",
"relu",
"reshape2",
"transpose2"};
auto* pass = new mir::subgraph::SubgraphProgramPass;
ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
}
// return output_var_names
std::vector<std::string> AddFCDesc( std::vector<std::string> AddFCDesc(
cpp::BlockDesc* block_desc, cpp::BlockDesc* block_desc,
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
...@@ -87,20 +44,20 @@ std::vector<std::string> AddFCDesc( ...@@ -87,20 +44,20 @@ std::vector<std::string> AddFCDesc(
auto* wgt = block_desc->AddVar<cpp::VarDesc>(); auto* wgt = block_desc->AddVar<cpp::VarDesc>();
wgt->SetName(prefix + "_W"); wgt->SetName(prefix + "_W");
auto* wtensor = scope->Var(prefix + "_W")->GetMutable<lite::Tensor>(); auto* wtensor = scope->Var(prefix + "_W")->GetMutable<Tensor>();
wtensor->Resize(wshape); wtensor->Resize(wshape);
wtensor->mutable_data<float>(); wtensor->mutable_data<float>();
auto* bias = block_desc->AddVar<cpp::VarDesc>(); auto* bias = block_desc->AddVar<cpp::VarDesc>();
bias->SetName(prefix + "_Bias"); bias->SetName(prefix + "_Bias");
auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<lite::Tensor>(); auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<Tensor>();
btensor->Resize({wshape[1]}); btensor->Resize({wshape[1]});
btensor->mutable_data<float>(); btensor->mutable_data<float>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("fc"); op_desc->SetType("fc");
op_desc->SetInput("Input", input_var_names); op_desc->SetInput("Input", input_var_names);
...@@ -126,7 +83,7 @@ std::vector<std::string> AddElementwiseAddDesc( ...@@ -126,7 +83,7 @@ std::vector<std::string> AddElementwiseAddDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("elementwise_add"); op_desc->SetType("elementwise_add");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -150,7 +107,7 @@ std::vector<std::string> AddFeedDesc( ...@@ -150,7 +107,7 @@ std::vector<std::string> AddFeedDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("feed"); op_desc->SetType("feed");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -173,7 +130,7 @@ std::vector<std::string> AddFetchDesc( ...@@ -173,7 +130,7 @@ std::vector<std::string> AddFetchDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("fetch"); op_desc->SetType("fetch");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -183,40 +140,88 @@ std::vector<std::string> AddFetchDesc( ...@@ -183,40 +140,88 @@ std::vector<std::string> AddFetchDesc(
return out_var_names; return out_var_names;
} }
std::unique_ptr<mir::SSAGraph> BuildSimpleNet( TEST(Subgraph, detect_simple_model) {
cpp::ProgramDesc* program_desc, cpp::ProgramDesc program_desc;
const std::shared_ptr<Scope>& scope, std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
const std::vector<Place>& valid_places) { auto scope = std::make_shared<Scope>();
program_desc->ClearBlocks(); // Build a simple network
auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>(); program_desc.ClearBlocks();
auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
block_desc->ClearOps(); block_desc->ClearOps();
block_desc->ClearVars(); block_desc->ClearVars();
auto* var_desc = block_desc->AddVar<cpp::VarDesc>(); auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
var_desc->SetName("feed_var"); var_desc->SetName("feed_var");
auto* feed_var = scope->Var("feed_var")->GetMutable<lite::Tensor>(); auto* feed_var = scope->Var("feed_var")->GetMutable<Tensor>();
feed_var->Resize({1, 4}); feed_var->Resize({1, 4});
auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5}); auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5});
auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2}); auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2});
Program program(program_desc, scope, valid_places);
lite::Program program(*program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph()); auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places); graph->Build(program, valid_places);
// Apply subgraph detector and check results
return graph; auto teller = [](mir::Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
auto op_type = stmt.op_type();
const std::vector<std::string> supported_types = {"fc"};
return std::find(supported_types.begin(), supported_types.end(), op_type) !=
supported_types.end();
};
std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1);
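  // 9 nodes are expected: 2 fc op nodes plus 7 var nodes (feed_var and the
  // W/Bias/Out vars of each of the two fc ops).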
ASSERT_EQ(graph->nodes().size(), 9);
mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
TEST(SubGraphTest, SimpleNet) { TEST(Subgraph, detect_custom_model) {
if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
FLAGS_params_file.empty()) {
LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
"the path of model files.";
return;
}
cpp::ProgramDesc program_desc; cpp::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>(); auto scope = std::make_shared<Scope>();
auto graph = BuildSimpleNet(&program_desc, scope, places); LoadModelPb(FLAGS_model_dir,
FLAGS_model_file,
std::vector<std::string> supported_op_types{"fc"}; FLAGS_params_file,
auto* pass = new mir::subgraph::SubgraphProgramPass; scope.get(),
ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); &program_desc,
!FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
ASSERT_EQ(graph->nodes().size(), 9); false);
// LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); std::vector<Place> valid_places({
#ifdef LITE_WITH_ARM
Place{TARGET(kARM), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_X86
Place{TARGET(kX86), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
});
Program program(program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places);
// Apply subgraph detector and check results
auto teller = [](mir::Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
auto op_type = stmt.op_type();
const std::vector<std::string> unsupported_types = {
"feed", "fetch", "subgraph"};
return std::find(unsupported_types.begin(),
unsupported_types.end(),
op_type) == unsupported_types.end();
};
std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1);
mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
} // namespace lite } // namespace lite
......
...@@ -12,58 +12,52 @@ ...@@ -12,58 +12,52 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #include "lite/core/mir/subgraph/subgraph_pass.h"
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_set>
#include <utility> #include <utility>
#include "lite/core/tensor.h" #include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace xpu { namespace mir {
class DeviceInfo { void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
public: std::unordered_set<std::string> supported_lists;
static DeviceInfo& Global() { #define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
static DeviceInfo x; #include "lite/kernels/npu/bridges/paddle_use_bridges.h"
return x; #undef USE_SUBGRAPH_BRIDGE
} auto teller = [&](Node* node) {
DeviceInfo() {} if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
void Insert(const std::string& name, return supported_lists.count(stmt.op_type()) != 0;
std::shared_ptr<xtcl::network::xRuntimeInstance> runtime) { };
if (runtimes_.find(name) != runtimes_.end()) { SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
LOG(WARNING) << "[XPU] Model " << name << " already exists."; fuser();
return; }
}
runtimes_.emplace(std::make_pair(name, runtime)); void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
void Clear() { runtimes_.clear(); } #include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
std::shared_ptr<xtcl::network::xRuntimeInstance> Find( auto teller = [&](Node* node) {
const std::string& name) const { if (!node->IsStmt()) return false;
if (runtimes_.find(name) != runtimes_.end()) { auto& stmt = node->AsStmt();
return runtimes_.at(name); return supported_lists.count(stmt.op_type()) != 0;
} else { };
return nullptr; SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
} fuser();
} }
private: } // namespace mir
int device_id_{0};
std::string device_name_{"default"};
std::unordered_map<std::string,
std::shared_ptr<xtcl::network::xRuntimeInstance>>
runtimes_;
};
bool LoadModel(const lite::Tensor& model,
std::shared_ptr<xtcl::network::xRuntimeInstance>* runtime);
} // namespace xpu
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
...@@ -12,30 +12,26 @@ ...@@ -12,30 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/xpu/bridges/registry.h" #pragma once
#include <utility>
#include <memory>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace mir {
namespace xpu {
namespace bridges {
Factory& Factory::Instance() {
static Factory g_xpu_bridge;
return g_xpu_bridge;
}
bool Factory::HasType(const std::string& op_type) const { class NPUSubgraphPass : public ProgramPass {
return map_.count(op_type); public:
} void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
void Factory::Insert(const std::string& op_type, const func_type& func_name) { class XPUSubgraphPass : public ProgramPass {
map_.insert(std::make_pair(op_type, func_name)); public:
} void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace bridges } // namespace mir
} // namespace xpu
} // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -30,7 +30,9 @@ DEFINE_int32(output_tensor_num, 1, "number of output tensors"); ...@@ -30,7 +30,9 @@ DEFINE_int32(output_tensor_num, 1, "number of output tensors");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
std::vector<std::vector<int64_t>> ParseShape(std::string txt) { // The helper functions for loading and running model from command line and
// verifying output data
std::vector<std::vector<int64_t>> ShapeParsing(std::string txt) {
std::vector<std::vector<int64_t>> shape; std::vector<std::vector<int64_t>> shape;
while (!txt.empty()) { while (!txt.empty()) {
size_t idx = txt.find_first_of(":"); size_t idx = txt.find_first_of(":");
...@@ -65,7 +67,7 @@ int64_t ShapeProduction(std::vector<int64_t> shape) { ...@@ -65,7 +67,7 @@ int64_t ShapeProduction(std::vector<int64_t> shape) {
return s; return s;
} }
void FillInputTensor( void FillInputTensors(
const std::shared_ptr<lite_api::PaddlePredictor>& predictor, const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
const std::vector<std::vector<int64_t>>& input_tensor_shape, const std::vector<std::vector<int64_t>>& input_tensor_shape,
const float value) { const float value) {
...@@ -80,7 +82,7 @@ void FillInputTensor( ...@@ -80,7 +82,7 @@ void FillInputTensor(
} }
} }
void CompareOutputTensor( void CheckOutputTensors(
const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor, const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor, const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
const int output_tensor_num) { const int output_tensor_num) {
...@@ -96,7 +98,7 @@ void CompareOutputTensor( ...@@ -96,7 +98,7 @@ void CompareOutputTensor(
auto abs_diff = auto abs_diff =
std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]);
auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << "val: " << tar_output_tensor_data[j] VLOG(5) << "val: " << tar_output_tensor_data[j]
<< " ref: " << ref_output_tensor_data[j] << " ref: " << ref_output_tensor_data[j]
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff;
EXPECT_LT(rel_diff, 0.1); EXPECT_LT(rel_diff, 0.1);
...@@ -111,24 +113,23 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( ...@@ -111,24 +113,23 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
const std::vector<lite_api::Place>& valid_places, const std::vector<lite_api::Place>& valid_places,
const std::vector<std::vector<int64_t>>& input_tensor_shape, const std::vector<std::vector<int64_t>>& input_tensor_shape,
const std::string& optimized_model_dir) { const std::string& optimized_model_dir) {
// generate optimized model // Generate optimized model
lite_api::CxxConfig cxx_config; lite_api::CxxConfig cxx_config;
cxx_config.set_model_dir(model_dir); cxx_config.set_model_dir(model_dir);
cxx_config.set_model_file(model_file); cxx_config.set_model_file(model_file);
cxx_config.set_param_file(params_file); cxx_config.set_param_file(params_file);
cxx_config.set_valid_places(valid_places); cxx_config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(cxx_config); auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
FillInputTensor(predictor, input_tensor_shape, 1);
predictor->SaveOptimizedModel(optimized_model_dir, predictor->SaveOptimizedModel(optimized_model_dir,
lite_api::LiteModelType::kNaiveBuffer); lite_api::LiteModelType::kNaiveBuffer);
// load optimized model // Load optimized model
lite_api::MobileConfig mobile_config; lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir); mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1); mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config); predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensor(predictor, input_tensor_shape, 1); FillInputTensors(predictor, input_tensor_shape, 1);
// run optimized model // Run optimized model
for (int i = 0; i < FLAGS_warmup; i++) { for (int i = 0; i < FLAGS_warmup; i++) {
predictor->Run(); predictor->Run();
} }
...@@ -140,32 +141,48 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( ...@@ -140,32 +141,48 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
return predictor; return predictor;
} }
TEST(NPUSubgraph, compare) { TEST(Subgraph, generate_model_and_check_precision) {
// parsing input tensor shape, supported formats: "1,3,224,224" if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
// "1,3,224,224:1,80" FLAGS_params_file.empty()) {
LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
"the path of model files.";
return;
}
// Parsing the shapes of input tensors from strings, supported formats:
// "1,3,224,224" and "1,3,224,224:1,80"
std::vector<std::vector<int64_t>> input_tensor_shape = std::vector<std::vector<int64_t>> input_tensor_shape =
ParseShape(FLAGS_input_tensor_shape); ShapeParsing(FLAGS_input_tensor_shape);
// generate and run optimized CPU model std::vector<lite_api::Place> valid_places({
LOG(INFO) << " ================ CPU ================== "; #ifdef LITE_WITH_ARM
auto cpu_predictor = lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
TestModel(FLAGS_model_dir, #endif
FLAGS_model_file, #ifdef LITE_WITH_X86
FLAGS_params_file, lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
{lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, #endif
input_tensor_shape, });
FLAGS_optimized_model_dir + "/CPU"); // Generate and run optimized model on CPU as the reference predictor
// generate and run optimized NPU model auto ref_predictor = TestModel(FLAGS_model_dir,
LOG(INFO) << " ================ NPU ================== "; FLAGS_model_file,
auto npu_predictor = FLAGS_params_file,
TestModel(FLAGS_model_dir, valid_places,
FLAGS_model_file, input_tensor_shape,
FLAGS_params_file, FLAGS_optimized_model_dir + "/ref_opt_model");
{lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, // Generate and run optimized model on NPU/XPU as the target predictor
lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, #ifdef LITE_WITH_NPU
input_tensor_shape, valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
FLAGS_optimized_model_dir + "/NPU"); #endif
// verify results #ifdef LITE_WITH_XPU
CompareOutputTensor(npu_predictor, cpu_predictor, FLAGS_output_tensor_num); valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
auto tar_predictor = TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
valid_places,
input_tensor_shape,
FLAGS_optimized_model_dir + "/tar_opt_model");
// Check the difference of the output tensors between reference predictor and
// target predictor
CheckOutputTensors(tar_predictor, ref_predictor, FLAGS_output_tensor_num);
} }
} // namespace lite } // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/subgraph_program_pass.h"
#include <memory>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::unordered_map<int, std::unordered_set<Node*>>
SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_map<int, std::unordered_set<Node*>> op_nodes;
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
int sub_id = stmt.subgraph_id();
if (sub_id < 1) continue;
if (!op_nodes.count(sub_id)) {
op_nodes[sub_id] = std::unordered_set<Node*>();
}
op_nodes.at(sub_id).insert(item);
}
return op_nodes;
}
cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc(
const std::string& weight_var_name,
const std::vector<std::string>& in_var_names,
const std::vector<std::string>& out_var_names) {
cpp::OpDesc op_desc;
op_desc.SetType("graph_op");
op_desc.SetInput("Inputs", in_var_names);
op_desc.SetInput("Weight", {weight_var_name});
op_desc.SetOutput("Outputs", out_var_names);
return op_desc;
}
void SubgraphProgramPass::InsertNewNode(
const std::unique_ptr<SSAGraph>& graph,
const std::string& weight_var_name,
Scope* scope,
const std::vector<Place>& valid_places,
std::unordered_set<Node*> in_data_vars,
std::unordered_set<Node*> in_wgt_vars,
std::unordered_set<Node*> out_data_vars,
std::unordered_set<Node*> out_unused_vars) {
std::vector<std::string> in_var_names;
std::vector<std::string> out_var_names;
for (auto i : in_data_vars) {
in_var_names.push_back(i->AsArg().name);
}
for (auto i : out_data_vars) {
out_var_names.push_back(i->AsArg().name);
}
auto op_desc = GenGraphOpDesc(weight_var_name, in_var_names, out_var_names);
auto graph_op = LiteOpRegistry::Global().Create("graph_op");
graph_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
for (auto& in_var : in_data_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& in_var : in_wgt_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& out_var : out_data_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
for (auto& out_var : out_unused_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
  // Add a weight node to store the pre-compiled NPU model
auto new_weight_node = graph->NewArgumentNode(weight_var_name);
new_weight_node->AsArg().is_weight = true;
new_weight_node->AsArg().is_persist = true;
DirectedLink(new_weight_node, new_op_node);
// assign context
auto& inst = new_op_node->AsStmt();
inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
}
void SubgraphProgramPass::SortHelper(
Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret) {
for (auto& var_node : node->inlinks) {
if (var_node->inlinks.empty()) continue;
auto* op_node = var_node->inlinks.front();
if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
SortHelper(op_node, nodes_all, visited_nodes, ret);
}
}
ret->push_back(node);
visited_nodes->insert(node);
}
std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
const std::unordered_set<Node*>& nodes) {
std::unordered_set<const Node*> visited;
std::vector<Node*> ret;
for (auto& node : nodes) {
if (!node->IsStmt()) continue;
if (visited.count(node)) continue;
SortHelper(node, nodes, &visited, &ret);
}
return ret;
}
void SubgraphProgramPass::FindInputOutputVars(
const std::unordered_set<Node*>& op_nodes,
std::unordered_set<Node*>* in_data_vars,
std::unordered_set<Node*>* in_wgt_vars,
std::unordered_set<Node*>* out_data_vars,
std::unordered_set<Node*>* out_unused_vars) {
for (auto& op_node : op_nodes) {
for (auto& in_var : op_node->inlinks) {
if (in_var->AsArg().is_weight) {
in_wgt_vars->insert(in_var);
continue;
}
if (!in_var->inlinks.empty()) {
        // A var can only be produced by one op node, so use front()
auto* pre_op_node = in_var->inlinks.front();
if (op_nodes.count(pre_op_node)) {
continue;
}
}
in_data_vars->insert(in_var);
}
for (auto& out_var : op_node->outlinks) {
if (out_var->outlinks.empty()) {
        // This var has no consumer op, so it is actually unused
out_unused_vars->insert(out_var);
continue;
}
      // A var can have more than one consumer op node, so if any consumer
      // is inside op_nodes, the var stays internal and we continue
bool next_op_in_nodes = false;
for (auto& next_op_node : out_var->outlinks) {
if (op_nodes.count(next_op_node)) {
next_op_in_nodes = true;
}
}
if (next_op_in_nodes) {
continue;
}
out_data_vars->insert(out_var);
}
}
}
std::unordered_set<const Node*> SubgraphProgramPass::GetNode2rm(
const std::unordered_set<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
std::unordered_set<const Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto& op_node : op_nodes) {
for (auto& in_var : op_node->inlinks) {
if (!nodes2rm.count(in_var)) {
nodes2rm.insert(in_var);
}
}
for (auto& out_var : op_node->outlinks) {
if (!nodes2rm.count(out_var)) {
nodes2rm.insert(out_var);
}
}
}
// some nodes should not be removed
for (auto& e : excluded_nodes) {
for (auto& i : e) {
if (nodes2rm.count(i)) {
nodes2rm.erase(i);
}
}
}
return nodes2rm;
}
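// Run CheckShape() and InferShape() for every op in topological order so
// that the tensor dims in the scope are determined before the subgraph is
// built; unless built with XPU support, each kernel is also launched once.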
void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
auto& op = stmt.op();
auto scope = op->scope();
std::string op_type = op->op_info()->Type();
    // Check the dimensions of the input variables in the scope; they must
    // not be empty!
if (op_type == "feed") {
auto input_var_names = op->op_info()->output_names();
CHECK_GE(input_var_names.size(), 1);
for (auto input_var_name : input_var_names) {
auto input_var = scope->FindVar(input_var_name);
CHECK(input_var) << "No input variable '" << input_var_name
<< "' found in scope " << scope;
auto input = input_var->GetMutable<lite::Tensor>();
CHECK(!input->dims().empty()) << "The dimension of input variable '"
<< input_var_name
<< "' can not be empty.";
}
continue;
}
if (op_type == "fetch") {
continue;
}
op->CheckShape();
op->InferShape();
#ifndef LITE_WITH_XPU
    // TODO(xxx): remove Launch() eventually
    auto& kernels = stmt.kernels();
    if (!kernels.empty()) {
      auto& kernel = kernels.front();
      if (kernel) {
        kernel->Launch();
      }
    }
#endif
}
}
void SubgraphProgramPass::InitSubgraphID(
const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types) {
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
stmt.ClearSubgraphID();
if (std::find(supported_op_types.begin(),
supported_op_types.end(),
stmt.op_type()) != supported_op_types.end()) {
stmt.SetSubgraphID(0);
LOG(INFO) << "supported " << stmt.op_type();
} else {
LOG(INFO) << "======= not supported " << stmt.op_type();
}
}
}
// Mark the current node and propagate the new subgraph id to all supported
// nodes reachable through its outputs
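// For an arg node, the id is propagated only if every consumer op of the arg
// is supported; otherwise the propagation stops there, so an unsupported op
// never ends up inside the subgraph.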
void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node,
int to_id,
int from_id) {
if (!node) return;
if (node->IsStmt()) {
auto& stmt = node->AsStmt();
if (stmt.subgraph_id() == from_id) {
stmt.SetSubgraphID(to_id);
for (auto& i : node->outlinks) {
ChangeAllOutConnectedID(i, to_id, from_id);
}
} else {
LOG(INFO) << "failed op type:" << stmt.op_type();
return;
}
} else {
    // this is an arg node
bool all_out_op_supported = true;
for (auto& i : node->outlinks) {
if (!i->IsStmt()) return;
auto& stmt = i->AsStmt();
if (stmt.subgraph_id() < from_id) {
all_out_op_supported = false;
}
}
if (!all_out_op_supported) {
return;
}
for (auto& i : node->outlinks) {
CHECK(i->IsStmt());
auto& stmt = i->AsStmt();
if (stmt.subgraph_id() == from_id) {
stmt.SetSubgraphID(to_id);
for (auto& o : i->outlinks) {
ChangeAllOutConnectedID(o, to_id, from_id);
}
}
}
}
}
int SubgraphProgramPass::FuseSubgraphID(
const std::unique_ptr<SSAGraph>& graph) {
  int sub_id = 1;  // ids start from 1, not 0
for (auto& item : graph->StmtTopologicalOrder()) {
// bool inputvar = false;
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
/*
if (stmt.subgraph_id() == -1) {
for (auto& i : item->outlinks) {
for (auto& j : i->outlinks) {
if (j->IsStmt()) {
auto& jstmt = j->AsStmt();
if (jstmt.subgraph_id() == 0) inputvar = true;
}
}
}
}
*/
if (stmt.subgraph_id() != 0) continue;
ChangeAllOutConnectedID(item, sub_id);
sub_id++;
}
return sub_id - 1;
}
int SubgraphProgramPass::FuseSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types) {
InitSubgraphID(graph, supported_op_types);
return FuseSubgraphID(graph);
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_program_pass,
paddle::lite::mir::subgraph::SubgraphProgramPass)
.BindTargets({TARGET(kAny)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
class SubgraphProgramPass : public ProgramPass {
public:
using key2nodes_t = std::map<std::string, Node*>;
  // Mark all of the linked ops in a subgraph with the same subgraph_id and
  // return the number of fused subgraphs
int FuseSubgraph(const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types);
void Apply(const std::unique_ptr<SSAGraph>& graph) override{};
protected:
void InferOnce(const std::unique_ptr<SSAGraph>& graph);
  // Clear all subgraph ids and mark all ops that could be fused with id zero
void InitSubgraphID(const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types);
  // Mark all of the linked ops in a subgraph with the same subgraph_id and
  // return the number of fused subgraphs
int FuseSubgraphID(const std::unique_ptr<SSAGraph>& graph);
// // GenerateFusedGraph:
// std::unique_ptr<SSAGraph> GenerateFusedGraph(const
// std::unique_ptr<SSAGraph>& graph, int sub_num);
void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
  // The functions below could be useful in child classes //
// classify node by subgraph id
std::unordered_map<int, std::unordered_set<Node*>> ClassifySubgraph(
const std::unique_ptr<SSAGraph>& graph);
// generate the graph op desc
cpp::OpDesc GenGraphOpDesc(const std::string& weight_var_name,
const std::vector<std::string>& in_var_names,
const std::vector<std::string>& out_var_names);
// insert a new graph op node
void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
const std::string& weight_var_name,
Scope* scope,
const std::vector<Place>& valid_places,
std::unordered_set<Node*> in_data_vars,
std::unordered_set<Node*> in_wgt_vars,
std::unordered_set<Node*> out_data_vars,
std::unordered_set<Node*> out_unused_vars);
  // Sort the nodes of the set and return them in topological order
std::vector<Node*> GetTopologicalOrder(
const std::unordered_set<Node*>& nodes);
  // Find all input data vars, input weight vars, output data vars and
  // unused output vars among the nodes
void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
std::unordered_set<Node*>* in_data_vars,
std::unordered_set<Node*>* in_wgt_vars,
std::unordered_set<Node*>* out_data_vars,
std::unordered_set<Node*>* out_unused_vars);
  // Return the nodes to remove from the subgraph
std::unordered_set<const Node*> GetNode2rm(
const std::unordered_set<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_nodes);
private:
  // Recursively sort the nodes into execution order
void SortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
};
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -27,12 +27,6 @@ ...@@ -27,12 +27,6 @@
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/core/types.h" #include "lite/core/types.h"
#include "lite/model_parser/model_parser.h" #include "lite/model_parser/model_parser.h"
#ifdef LITE_WITH_NPU
#include "lite/core/mir/subgraph/generate_npu_program_pass.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -109,7 +103,9 @@ class Optimizer { ...@@ -109,7 +103,9 @@ class Optimizer {
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", "argument_type_display_pass",
"memory_optimize_pass"}}; "memory_optimize_pass",
"npu_subgraph_pass",
"xpu_subgraph_pass"}};
RunPasses(passes_local); RunPasses(passes_local);
} else { } else {
RunPasses(passes); RunPasses(passes);
...@@ -121,13 +117,6 @@ class Optimizer { ...@@ -121,13 +117,6 @@ class Optimizer {
// Generate a new program based on the mir graph. // Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() { std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
// Extra passes are applied for NPU and XPU, they depends on the shapes
// of input tensors. so GenRuntimeProgram() must be called after the shapes
// of input tensors are determined.
std::vector<std::string> subgraph_passes{"generate_npu_program_pass",
"generate_xpu_program_pass"};
RunPasses(subgraph_passes);
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>( auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
"generate_program_pass"); "generate_program_pass");
pass->Apply(graph_); pass->Apply(graph_);
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "lite/model_parser/cpp/op_desc.h" #include "lite/model_parser/cpp/op_desc.h"
#include "lite/model_parser/cpp/var_desc.h" #include "lite/model_parser/cpp/var_desc.h"
#include "lite/operators/conditional_block_op.h" #include "lite/operators/conditional_block_op.h"
#include "lite/operators/subgraph_op.h"
#include "lite/operators/while_op.h" #include "lite/operators/while_op.h"
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#include "lite/core/profile/precision_profiler.h" #include "lite/core/profile/precision_profiler.h"
...@@ -31,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { ...@@ -31,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) {
  // NOTE: RuntimeProgram does not have all meta info, so saving the model  // NOTE: RuntimeProgram does not have all meta info, so saving the model
  // just updates the original model  // just updates the original model
CHECK(desc->BlocksSize()); CHECK(desc->BlocksSize());
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0); auto main_block = desc->GetBlock<cpp::BlockDesc>(0);
main_block.ClearOps(); main_block->ClearOps();
for (auto& node : instructions_) { for (auto& node : instructions_) {
auto* op = main_block.AddOp<cpp::OpDesc>(); auto op_type = node.op()->op_info()->Type();
if (op_type == "subgraph") {
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(node.op()));
int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
if (sub_block_idx < 0) {
        // It's a new subgraph op when its sub_block_idx < 0. Now we add its
        // sub block desc to the program desc, then update its sub_block_idx
        // to the index of the block desc in the program desc.
sub_block_idx = desc->BlocksSize();
auto sub_block_desc = subgraph_op->GetSubBlock();
CHECK(sub_block_desc);
auto new_block_desc = desc->AddBlock<cpp::BlockDesc>();
*new_block_desc = *sub_block_desc;
delete sub_block_desc;
subgraph_op->mutable_op_info()->SetAttr<int32_t>("sub_block",
sub_block_idx);
subgraph_op->SetSubBlock(new_block_desc);
        // Update the main block desc after a new sub block desc is added
main_block = desc->GetBlock<cpp::BlockDesc>(0);
}
}
auto op = main_block->AddOp<cpp::OpDesc>();
*op = *node.op()->op_info(); *op = *node.op()->op_info();
op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType());
} }
...@@ -142,16 +165,25 @@ void Program::Build(const cpp::ProgramDesc& prog) {
    VLOG(4) << "create Op [" << op_type << "]";
    auto op = LiteOpRegistry::Global().Create(op_type);
    CHECK(op) << "no Op found for " << op_type;
    if (op_type == "while" || op_type == "conditional_block" ||
        op_type == "subgraph") {
      auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
      CHECK(sub_block_idx >= 0 && sub_block_idx < prog.BlocksSize())
          << "Invalid attribute sub_block(" << sub_block_idx << ") for "
          << op_type;
      auto sub_block_desc =
          const_cast<cpp::ProgramDesc&>(prog).GetBlock<cpp::BlockDesc>(
              sub_block_idx);
      CHECK(sub_block_desc);
      if (op_type == "while") {
        static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(
            sub_block_desc);
      } else if (op_type == "conditional_block") {
        static_cast<operators::ConditionalBlockOpLite*>(op.get())->SetSubBlock(
            sub_block_desc);
      } else if (op_type == "subgraph") {
        static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(
            sub_block_desc);
      }
    }
    ops_.emplace_back(std::move(op));
...
add_kernel(subgraph_compute_npu NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_npu subgraph_bridge_engine ${npu_subgraph_bridges})

if(NOT LITE_ON_TINY_PUBLISH)
    add_subdirectory(bridges)
endif()
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU)
    return()
endif()

lite_cc_library(subgraph_bridge_registry
    SRCS registry.cc
    DEPS op)

lite_cc_library(subgraph_bridge_engine
    SRCS engine.cc
    DEPS tensor op scope program)

if(NOT LITE_WITH_NPU)
    return()
endif()

lite_cc_library(subgraph_bridge_utility_npu SRCS utility.cc DEPS ${npu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_npu SRCS graph.cc DEPS subgraph_bridge_utility_npu)

set(npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_npu subgraph_bridge_graph_npu)

lite_cc_library(subgraph_bridge_fc_op_npu SRCS fc_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_op_npu SRCS conv_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_mul_op_npu SRCS mul_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_npu SRCS act_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_npu SRCS scale_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_npu SRCS softmax_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_npu SRCS pool_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_npu SRCS batch_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_npu SRCS elementwise_ops.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_npu SRCS reshape_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_npu SRCS conv_transpose_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_interpolate_op_npu SRCS interpolate_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_npu SRCS transpose_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_split_op_npu SRCS split_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_npu SRCS concat_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_shuffle_channel_op_npu SRCS shuffle_channel_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pad2d_op_npu SRCS pad2d_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_square_op_npu SRCS square_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_sqrt_op_npu SRCS sqrt_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reduce_mean_op_npu SRCS reduce_mean_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})

set(npu_subgraph_bridges
    subgraph_bridge_registry
    subgraph_bridge_utility_npu
    subgraph_bridge_graph_npu
    subgraph_bridge_fc_op_npu
    subgraph_bridge_conv_op_npu
    subgraph_bridge_mul_op_npu
    subgraph_bridge_act_op_npu
    subgraph_bridge_scale_op_npu
    subgraph_bridge_softmax_op_npu
    subgraph_bridge_pool_op_npu
    subgraph_bridge_batch_norm_op_npu
    subgraph_bridge_elementwise_ops_npu
    subgraph_bridge_reshape_op_npu
    subgraph_bridge_conv_transpose_op_npu
    subgraph_bridge_interpolate_op_npu
    subgraph_bridge_transpose_op_npu
    subgraph_bridge_split_op_npu
    subgraph_bridge_concat_op_npu
    subgraph_bridge_shuffle_channel_op_npu
    subgraph_bridge_pad2d_op_npu
    subgraph_bridge_square_op_npu
    subgraph_bridge_sqrt_op_npu
    subgraph_bridge_reduce_mean_op_npu
    subgraph_bridge_unsqueeze_op_npu
    subgraph_bridge_argmax_op_npu
    CACHE INTERNAL "npu_subgraph_bridges")

message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
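For orientation, a sketch of how these registered bridges are typically consumed: the cached npu_subgraph_bridges list links every converter into subgraph_compute_npu, and the subgraph detection side asks the bridge registry whether an op type has a converter for a device before fusing it. The registry type and its Exists() lookup below are assumed stand-ins; the concrete API lives in lite/kernels/npu/bridges/registry.h and is not reproduced in this diff:

#include <string>

// Sketch only: Registry and Exists() are assumed stand-ins for the lookup
// defined in lite/kernels/npu/bridges/registry.h.
bool CanOffload(const std::string& device, const std::string& op_type) {
  // e.g. CanOffload("NPU", "conv2d") holds once subgraph_bridge_conv_op_npu
  // has been linked in through ${npu_subgraph_bridges}.
  return paddle::lite::subgraph::Registry::Instance().Exists(device, op_type);
}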
...@@ -12,34 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ActConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Create act node and set input node which is obtained from the node map
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto act_node = graph->AddNode<ge::op::Activation>(out_var_name);
  act_node->set_input_x(*graph->GetNode(x_var_name));

  // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
  // clipped_relu etc.
  act_node->set_attr_mode(CvtActMode(op_type));
  if (op_type == "relu_clipped") {
    auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
...@@ -56,31 +54,33 @@
    act_node->set_attr_negative_slope(slope);
    act_node->set_attr_coef(offset);
  }
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, sigmoid, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, tanh, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu_clipped, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu6, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, leaky_relu, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, abs, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, softsign, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, softplus, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, hard_sigmoid, paddle::lite::subgraph::npu::ActConverter);
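The act converter above is the simplest instance of the bridge contract this patch introduces. Distilled from the converters in this diff (a restatement, not a new API), the shared skeleton is:

// Skeleton shared by the bridge converters in this diff.
int AnyOpConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);  // device IR graph being built
  auto op_info = op->op_info();
  // 1. Read input/output var names and attributes from op_info, and weight
  //    tensors from op->scope().
  // 2. Add device nodes with graph->AddNode<...>(...), keyed by output var
  //    names so later ops can look them up with graph->GetNode(...).
  // 3. Wire nodes together with set_input_*() / set_attr_*() calls.
  return SUCCESS;  // or FAILED, or REBUILD_WHEN_SHAPE_CHANGED when the device
                   // model must be regenerated after an input shape change
}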
...@@ -12,59 +12,41 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ArgmaxConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  int axis = op_info->GetAttr<int64_t>("axis");

  auto argmax_node = graph->AddNode<ge::op::ArgMax>(out_var_name);
  argmax_node->set_input_x1(*graph->GetNode(x_var_name));

  auto x2 = graph->AddNode(out_var_name + "/axis", axis);
  argmax_node->set_input_x2(*x2);
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, arg_max, paddle::lite::subgraph::npu::ArgmaxConverter);
...@@ -12,81 +12,66 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int BatchNormConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Output("Y").front();
  auto batch_norm_node = graph->AddNode<ge::op::BatchNormExt2>(y_var_name);
  batch_norm_node->set_input_x(*graph->GetNode(x_var_name));

  auto scale_var_name = op_info->Input("Scale").front();
  auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
  auto scale_const_node = graph->AddNode(scale_var_name, *scale);

  auto bias_var_name = op_info->Input("Bias").front();
  auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
  auto bias_const_node = graph->AddNode(bias_var_name, *bias);

  auto mean_var_name = op_info->Input("Mean").front();
  auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
  auto mean_const_node = graph->AddNode(mean_var_name, *mean);

  auto variance_var_name = op_info->Input("Variance").front();
  auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
  auto variance_const_node = graph->AddNode(variance_var_name, *variance);

  float momentum = op_info->GetAttr<float>("momentum");
  float epsilon = op_info->GetAttr<float>("epsilon");
  int mode = 1;  // bnScale, bnBias tensor dims are 1xCx1x1
  bool use_global_stats = op_info->GetAttr<bool>("use_global_stats");

  batch_norm_node->set_input_scale(*scale_const_node);
  batch_norm_node->set_input_offset(*bias_const_node);
  batch_norm_node->set_input_mean(*mean_const_node);
  batch_norm_node->set_input_variance(*variance_const_node);
  batch_norm_node->set_attr_momentum(momentum);
  batch_norm_node->set_attr_epsilon(epsilon);
  batch_norm_node->set_attr_mode(mode);
  batch_norm_node->set_attr_use_global_stats(use_global_stats);
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, batch_norm, paddle::lite::subgraph::npu::BatchNormConverter);
...@@ -12,58 +12,51 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConcatConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << " ... ";

  auto x_var_names = op_info->Input("X");
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto num = x_var_names.size();

  auto concat_node = graph->AddNode<ge::op::Concat>(out_var_name);
  concat_node->set_attr_axis(axis);
  concat_node->set_attr_N(num);
  concat_node->create_dynamic_input_x(num);
  int idx = 1;
  for (auto& x_var_name : x_var_names) {
    if (graph->HasNode(x_var_name)) {
      concat_node->set_dynamic_input_x(idx, *graph->GetNode(x_var_name));
    } else {
      auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
      auto x_const_node = graph->AddNode(x_var_name, *x);
      concat_node->set_dynamic_input_x(idx, *x_const_node);
    }
    idx++;
  }
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, concat, paddle::lite::subgraph::npu::ConcatConverter);
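One detail worth calling out in the converter above: ge::op::Concat dynamic inputs are declared with a count and then bound with 1-based indices, which is why idx starts at 1 (convention as it appears in this file):

// Dynamic-input convention of ge::op::Concat as used above (1-based indices):
//   concat_node->create_dynamic_input_x(num);        // declare num inputs
//   for (int idx = 1; idx <= num; ++idx) {
//     concat_node->set_dynamic_input_x(idx, *node);  // bind from index 1
//   }
//   concat_node->set_attr_N(num);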
...@@ -13,32 +13,33 @@
// limitations under the License.

#include "lite/operators/conv_op.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConvConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << "... ";

  // Get input, filter and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_dims = input->dims();
  auto output_var_name = op_info->Output("Output").front();
  auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
  auto output_dims = output->dims();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_dims = filter->dims();
  auto bs = input_dims[0];
  auto ic = input_dims[1];
...@@ -63,7 +64,7 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the input size.";

  std::string padding_algorithm("");
  if (op_info->HasAttr("padding_algorithm")) {
...@@ -76,9 +77,9 @@
                 input_dims,
                 filter_dims);

  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
  bool use_depthwise_conv =
      false;  // Whether use ge::op::ConvolutionDepthwise ?
  bool is_depthwise_mode = ic == groups && oc == groups;
  if (is_depthwise_mode &&
      !((groups == 1 || groups >= 5) && dilations[0] == 1 &&
...@@ -90,26 +91,19 @@
           "performance.";
  }

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);

  // Create bias node if exists bias
  // Supports the bias nodes with the following dimensions
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  std::shared_ptr<ge::Operator> bias_node = nullptr;
  bool is_channel_bias = false;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto bias_dims = bias->dims();
    auto bias_data_size = bias_dims.production();
    auto output_data_size = output_dims.production();
...@@ -125,28 +119,26 @@
      // 2: {n, oc, oh, ow}
      bias_shape = output_dims.Vectorize();
    } else {
      LOG(WARNING) << "[NPU] Bias dimension " << bias_dims
                   << " isn't supported in conv2d Op when output dimension is "
                   << output_dims;
      return FAILED;
    }
    if (graph->HasNode(bias_var_name)) {
      // Bias node from input map
      bias_node = graph->GetNode(bias_var_name);
    } else {
      // Bias node with const data
      bias_node = graph->AddNode(bias_var_name, *bias, bias_shape);
    }
  }

  // Create conv node and set input, filter, bias nodes and attributes
  std::shared_ptr<ge::Operator> conv_node = nullptr;
  if (use_depthwise_conv && is_depthwise_mode) {
    auto depthwise_conv_node =
        graph->AddNode<ge::op::ConvolutionDepthwise>(output_var_name);
    depthwise_conv_node->set_input_x(*graph->GetNode(input_var_name));
    depthwise_conv_node->set_input_filter(*filter_const_node);
    depthwise_conv_node->set_attr_mode(1);
    depthwise_conv_node->set_attr_algo(0);
...@@ -161,21 +153,19 @@
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    depthwise_conv_node->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    conv_node = depthwise_conv_node;
    // ConvolutionDepthwise Op doesn't support bias, so append Add node to
    // support bias
    if (bias_node != nullptr) {
      auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
      add_node->set_input_x1(*depthwise_conv_node);
      add_node->set_input_x2(*bias_node);
      conv_node = add_node;
    }
  } else {
    auto common_conv_node =
        graph->AddNode<ge::op::Convolution>(output_var_name);
    common_conv_node->set_input_x(*graph->GetNode(input_var_name));
    common_conv_node->set_input_w(*filter_const_node);
    common_conv_node->set_attr_mode(1);
    common_conv_node->set_attr_pad_mode(0);  // NOTSET
...@@ -188,7 +178,6 @@
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    common_conv_node->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    conv_node = common_conv_node;
    // Convolution Op only support bias with dimension {1, oc, 1, 1},
    // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
...@@ -196,37 +185,32 @@
    if (is_channel_bias) {
      common_conv_node->set_input_b(*bias_node);
    } else {
      auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
      add_node->set_input_x1(*common_conv_node);
      add_node->set_input_x2(*bias_node);
      conv_node = add_node;
    }
  }
  CHECK(conv_node);

  if (fuse_relu) {
    // Append relu node if fuse_relu is true
    auto relu_node = graph->AddNode<ge::op::Activation>(output_var_name);
    relu_node->set_input_x(*conv_node);
    relu_node->set_attr_mode(CvtActMode("relu"));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, conv2d, paddle::lite::subgraph::npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d, paddle::lite::subgraph::npu::ConvConverter);
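The bias handling above accepts three layouts and routes channel biases through set_input_b while feeding full-size biases through an extra Add node. A standalone restatement of that classification rule (hypothetical helper, example shapes assumed):

#include <cstdint>
#include <vector>

// Hypothetical restatement of the rule above. output_dims is {n, oc, oh, ow};
// returns false for shapes the converter rejects with FAILED.
bool ClassifyConvBias(const std::vector<int64_t>& bias_dims,
                      const std::vector<int64_t>& output_dims,
                      bool* is_channel_bias) {
  int64_t bias_size = 1, output_size = 1;
  for (auto d : bias_dims) bias_size *= d;
  for (auto d : output_dims) output_size *= d;
  if (bias_size == output_dims[1]) {  // {oc}: per-channel, set_input_b path
    *is_channel_bias = true;
    return true;
  }
  if (bias_size == output_size / output_dims[0] ||  // {1, oc, oh, ow}
      bias_size == output_size) {                   // {n, oc, oh, ow}
    *is_channel_bias = false;  // handled through an appended Add node
    return true;
  }
  return false;
}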
...@@ -12,30 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConvTransposeConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << "... ";

  // Get input, output and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_shape = input->dims().Vectorize();
  auto output_var_name = op_info->Output("Output").front();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_shape = filter->dims().Vectorize();
  CHECK_EQ(input_shape.size(), 4);
  CHECK_EQ(filter_shape.size(), 4);
...@@ -54,42 +55,34 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the input size.";

  // Create deconv node
  auto conv_transpose_node =
      graph->AddNode<ge::op::Deconvolution>(output_var_name);

  // Create input sizes node to describe the dimensions of input tensor
  std::vector<int32_t> input_sizes;
  input_sizes.push_back(input_shape[0]);
  input_sizes.push_back(filter_shape[1] * groups);
  for (int i = 0; i < strides.size(); i++) {
    int kernel_ext = dilations[i] * (filter_shape[i + 2] - 1) + 1;
    int output_size =
        (input_shape[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i];
    input_sizes.push_back(output_size);
  }
  auto input_sizes_const_node =
      graph->AddNode(output_var_name + "/input_sizes", input_sizes);
  conv_transpose_node->set_input_input_sizes(*input_sizes_const_node);

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);
  conv_transpose_node->set_input_filter(*filter_const_node);

  // Set input node
  conv_transpose_node->set_input_x(*graph->GetNode(input_var_name));

  // Set attributes
  conv_transpose_node->set_attr_format(0);    // NCHW
  conv_transpose_node->set_attr_pad_mode(0);  // NOTSET
  conv_transpose_node->set_attr_group(groups);
...@@ -101,50 +94,39 @@
      ge::AttrValue::LIST_INT({strides[0], strides[1]}));
  conv_transpose_node->set_attr_kernel(
      ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]}));

  // Append add node to add bias if exists bias
  std::shared_ptr<ge::Operator> output_node = conv_transpose_node;
  if (HasInputArg(op_info, scope, "Bias")) {
    // Create bias node
    auto bias_var_name = op_info->Input("Bias").front();
    CHECK(!graph->HasNode(bias_var_name));
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto channel_size = bias->dims().production();
    CHECK_EQ(channel_size, filter_shape[1] * groups);
    auto bias_const_node =
        graph->AddNode(bias_var_name, *bias, {1, channel_size, 1, 1});
    // Append add node to add bias node
    auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
    add_node->set_input_x1(*conv_transpose_node);
    add_node->set_input_x2(*bias_const_node);
    output_node = add_node;
  }

  if (fuse_relu) {
    // Append relu node if fuse_relu is true
    auto relu_node = graph->AddNode<ge::op::Activation>(output_var_name);
    relu_node->set_input_x(*output_node);
    relu_node->set_attr_mode(CvtActMode("relu"));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, conv2d_transpose, paddle::lite::subgraph::npu::ConvTransposeConverter);
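The input_sizes loop above applies the standard transposed-convolution size formula, output = (input - 1) * stride + dilation * (kernel - 1) + 1 - 2 * padding. A worked instance with assumed numbers:

#include <cstdio>

// Example: 16x16 spatial input, stride 2, dilation 1, 3x3 kernel, padding 1.
int main() {
  int input = 16, stride = 2, dilation = 1, kernel = 3, pad = 1;
  int kernel_ext = dilation * (kernel - 1) + 1;              // = 3
  int output = (input - 1) * stride + kernel_ext - 2 * pad;  // = 31
  std::printf("deconv output size: %d\n", output);
  return 0;
}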
...@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
  auto x_dims = x.dims();
  CHECK_EQ(x_dims.size(), 4UL) << "[NPU] Only support 4-dimension x";
  auto y_dims = y->dims();
  CHECK_GE(x_dims.size(), y_dims.size());
...@@ -45,93 +45,86 @@
  return y_new_shape;
}

int ElementwiseConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");

  std::shared_ptr<ge::Operator> elementwise_node = nullptr;
  std::shared_ptr<ge::Operator> x_node = graph->GetNode(x_var_name);
  std::shared_ptr<ge::Operator> y_node = nullptr;
  if (graph->HasNode(y_var_name)) {
    y_node = graph->GetNode(y_var_name);
  } else {
    auto x = scope->FindTensor(x_var_name);
    auto y = scope->FindMutableTensor(y_var_name);
    auto y_new_shape = CvtYShape(*x, y, axis);
    y_node = graph->AddNode(y_var_name, y, y_new_shape);
  }

  if (op_type == "elementwise_add" ||
      op_type == "fusion_elementwise_add_activation") {
    auto elt_node = graph->AddNode<ge::op::Add>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_sub") {
    auto elt_node = graph->AddNode<ge::op::Sub>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_mul") {
    auto elt_node = graph->AddNode<ge::op::Mul>(out_var_name);
    elt_node->set_input_x(*x_node);
    elt_node->set_input_y(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_div") {
    auto elt_node = graph->AddNode<ge::op::RealDiv>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else {
    LOG(WARNING) << "[NPU] Unsupported op type: " << op_type;
    return FAILED;
  }

  if (op_type == "fusion_elementwise_add_activation") {
    auto act_type = op_info->GetAttr<std::string>("act_type");
    auto act_node = graph->AddNode<ge::op::Activation>(out_var_name);
    act_node->set_input_x(*elementwise_node);
    // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
    // clipped_relu etc.
    act_node->set_attr_mode(CvtActMode(act_type));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_add, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_sub, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_mul, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_div, paddle::lite::subgraph::npu::ElementwiseConverter);
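CvtYShape aligns y into x's 4-D layout starting at axis so the NPU sees broadcast-compatible operands. A standalone restatement with an assumed example (intended to mirror CvtYShape, whose body is elided in this diff):

#include <cstdint>
#include <vector>

// Align y's dims into x's 4-D layout starting at `axis`, padding with 1s.
// E.g. x {2, 3, 4, 5}, y {3, 4}, axis 1  ->  {1, 3, 4, 1}.
std::vector<int64_t> AlignYShape(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims,
                                 int axis) {
  if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
  std::vector<int64_t> y_new_shape(x_dims.size(), 1);
  for (size_t i = 0; i < y_dims.size(); i++) {
    y_new_shape[axis + i] = y_dims[i];
  }
  return y_new_shape;
}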
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/engine.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
int Engine::BuildDeviceProgram() { return FAILED; }
int Engine::LaunchDeviceProgram() { return 0; }
int Engine::BuildOriginProgram() {
// TODO(hong19860320) The block_desc needs to be divided into subgraphs at
// execution time, but for now the whole block is treated as a single
// subgraph.
origin_program_.clear();
for (int op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) {
auto op_desc = block_desc_->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
std::string op_type = op_desc->Type();
auto op = LiteOpRegistry::Global().Create(op_desc->Type());
op->Attach(*op_desc, scope_);
std::unique_ptr<KernelBase> picked_kernel;
if (op_desc->HasAttr(kKernelTypeAttr)) {
// Create op and pick up kernel according to the kKernelTypeAttr attribute
auto kernel_type = op_desc->GetAttr<std::string>(kKernelTypeAttr);
std::string alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type
<< " for " << op_type;
auto kernels = op->CreateKernels({place});
CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type;
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
picked_kernel = std::move(*it);
} else {
VLOG(3) << "The attr '" << kKernelTypeAttr
<< "' not found, pick the first kernel for " << op_type;
#if defined(LITE_WITH_ARM)
auto kernels = op->CreateKernels({Place{TARGET(kARM)}});
#elif defined(LITE_WITH_X86)
auto kernels = op->CreateKernels({Place{TARGET(kX86)}});
#endif
CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type;
picked_kernel = std::move(kernels.front());
}
picked_kernel->SetContext(
ContextScheduler::Global().NewContext(picked_kernel->target()));
origin_program_.emplace_back(std::move(op), std::move(picked_kernel));
}
return 0;
}
int Engine::LaunchOriginProgram() {
for (auto& inst : origin_program_) {
auto op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
inst.Run();
}
return 0;
}
int Engine::Build() {
  // In order to attach all of the ops of the block desc, we need to build
  // the origin program first.
  BuildOriginProgram();
  // Run InferShape() on all of the ops, and convert Paddle ops to the NPU/XPU
  // IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
bool Engine::InputShapeChanged() {
for (int i = 0; i < origin_itensors_.size(); i++) {
if (origin_itensors_[i]->dims() != origin_idims_[i]) {
return true;
}
}
return false;
}
int Engine::Launch() {
// Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
} // namespace subgraph
} // namespace lite
} // namespace paddle
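For orientation, a minimal sketch of how a device backend could specialize this base class (the subclass and its comments are illustrative, not part of this patch): returning REBUILD_WHEN_SHAPE_CHANGED from BuildDeviceProgram() is what makes Launch() rebuild the device program when InputShapeChanged() fires.

class FakeDeviceEngine : public Engine {  // hypothetical subclass, for illustration
 public:
  using Engine::Engine;

 protected:
  int BuildDeviceProgram() override {
    // Convert the ops of origin_program_ into a device IR graph here,
    // then compile it into an executable device model.
    return REBUILD_WHEN_SHAPE_CHANGED;
  }
  int LaunchDeviceProgram() override {
    // Feed origin_itensors_, run the device model, fetch origin_otensors_.
    return 0;
  }
};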
@@ -14,52 +14,63 @@

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/program.h"
#include "lite/core/tensor.h"

namespace paddle {
namespace lite {
namespace subgraph {

class Engine {
 public:
  Engine(int block_idx,
         cpp::BlockDesc *block_desc,
         const std::vector<std::string> &input_names,
         const std::vector<std::string> &output_names,
         lite::Scope *scope)
      : block_idx_(block_idx),
        block_desc_(block_desc),
        input_names_(input_names),
        output_names_(output_names),
        scope_(scope) {}
  virtual ~Engine() = default;

  virtual int Build();
  virtual int Launch();

 private:
  Engine(const Engine &) = delete;

 protected:
  virtual int BuildDeviceProgram();
  virtual int LaunchDeviceProgram();

  virtual int BuildOriginProgram();
  virtual int LaunchOriginProgram();

  virtual bool InputShapeChanged();

  int block_idx_;
  cpp::BlockDesc *block_desc_;
  std::vector<std::string> input_names_;
  std::vector<std::string> output_names_;
  Scope *scope_{nullptr};
  // SUCCESS: the device program was built successfully.
  // FAILED: the device program build failed.
  // REBUILD_WHEN_SHAPE_CHANGED: the device program was built successfully,
  // but needs to be rebuilt when the input shape changes.
  int build_device_program_status_{0};
  std::vector<DDim> origin_idims_;
  std::vector<DDim> origin_odims_;
  std::vector<Tensor *> origin_itensors_;
  std::vector<Tensor *> origin_otensors_;
  std::vector<Instruction> origin_program_;
};

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
@@ -12,31 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int FCConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("Input").front();
  auto w_var_name = op_info->Input("W").front();
  auto out_var_name = op_info->Output("Out").front();
  int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
  auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
  auto x_dims = x->dims();
  auto w_dims = w->dims();

@@ -50,71 +50,54 @@
  VLOG(3) << "[NPU] x dims: " << x_dims << " w dims: " << w_dims << " m: " << m
          << " k: " << k << " n: " << n;

  auto fc_node = graph->AddNode<ge::op::FullConnection>(out_var_name + "/fc");
  CHECK(!graph->HasNode(w_var_name));

  // Reshape x to (m, k, 1, 1)
  auto reshaped_x_node =
      graph->AddNode<ge::op::Reshape>(x_var_name + "/reshape");
  reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name));
  reshaped_x_node->set_attr_shape({m, k, 1, 1});
  reshaped_x_node->set_attr_axis(0);
  fc_node->set_input_x(*reshaped_x_node);

  // Create w const node, set its shape to (n, k, 1, 1) and fill with
  // the transposed w tensor
  Tensor transpose_w;
  transpose_w.Resize({n, k, 1, 1});
  auto transpose_w_data = transpose_w.mutable_data<float>();
  auto w_data = w->mutable_data<float>();
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      transpose_w_data[j * k + i] = w_data[i * n + j];
    }
  }
  auto w_const_node = graph->AddNode(w_var_name, transpose_w);
  fc_node->set_input_w(*w_const_node);

  // Add bias node if bias tensor exists
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
    auto bias_dims = bias->dims();
    CHECK(!graph->HasNode(bias_var_name));
    CHECK_EQ(bias_dims.production(), n);
    auto bias_const_node = graph->AddNode(bias_var_name, *bias, {1, n, 1, 1});
    fc_node->set_input_b(*bias_const_node);
  }

  // Reshape output of fc_node from (m, n, 1, 1) to (m, n)
  auto reshaped_fc_node = graph->AddNode<ge::op::Reshape>(out_var_name);
  reshaped_fc_node->set_input_tensor(*fc_node);
  reshaped_fc_node->set_attr_shape({m, n});
  reshaped_fc_node->set_attr_axis(0);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, fc, paddle::lite::subgraph::npu::FCConverter);
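The transposed-weight indexing above can be checked in isolation; a self-contained sketch (values assumed):

// Standalone check of the transpose indexing used above: element (i, j) of
// the row-major (k, n) weight lands at (j, i) of the row-major (n, k) copy.
#include <cassert>
#include <vector>

int main() {
  const int k = 2, n = 3;
  std::vector<float> w = {0, 1, 2,   // row 0
                          3, 4, 5};  // row 1
  std::vector<float> t(n * k);
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      t[j * k + i] = w[i * n + j];
    }
  }
  assert(t == (std::vector<float>{0, 3, 1, 4, 2, 5}));
  return 0;
}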
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include <utility>
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Const node
std::shared_ptr<ge::op::Const> Graph::AddNode(const std::string& name,
const Tensor& tensor,
PrecisionType ptype,
DataLayoutType ltype) {
return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype);
}
std::shared_ptr<ge::op::Const> Graph::AddNode(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name)) << "Node " << name << " redefined.";
auto node = AddNode<ge::op::Const>(name);
node->set_attr_value(CvtTensor(tensor, shape, ptype, ltype));
return node;
}
// Data node
std::shared_ptr<ge::op::Data> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name)) << "Node " << name << " redefined.";
auto node = AddNode<ge::op::Data>(name);
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(ltype), CvtPrecisionType(ptype));
node->update_input_desc_x(desc);
nodes_.insert(std::make_pair(name, node));
return node;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Graph of HiAI IR nodes: maintains the mapping from Paddle var names to the
// converted HiAI IR nodes
class Graph {
public:
template <typename T>
std::shared_ptr<T> AddNode(const std::string& name) {
auto unique_name = [&](const std::string& key) {
int idx = 1;
auto it = counts_.find(key);
if (it == counts_.end()) {
counts_.insert(std::make_pair(key, idx));
} else {
idx = ++(it->second);
}
return key + "_" + std::to_string(idx);
};
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Generate a new unique name as the key to bind the origin node:
// new_name->node
nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second));
nodes_.erase(it);
}
// Create a new node and bind with the name: name->new_node
auto node = std::make_shared<T>(unique_name(name + "_op"));
nodes_.insert(std::make_pair(name, node));
return node;
}
// Const node
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const Tensor& tensor,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
template <typename T>
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const std::vector<T>& data,
std::vector<int64_t> shape = {},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
const std::type_info& info = typeid(T);
PrecisionType ptype = PRECISION(kFloat);
    if (info == typeid(float)) {
      ptype = PRECISION(kFloat);
    } else if (info == typeid(int8_t)) {
      ptype = PRECISION(kInt8);
    } else if (info == typeid(int32_t)) {
      ptype = PRECISION(kInt32);
    } else {
      LOG(FATAL) << "[NPU] Unknown data type " << info.name();
    }
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
} else {
int size = 1;
for (auto i : shape) {
size *= i;
}
CHECK_EQ(data.size(), size);
}
Tensor tensor;
tensor.Resize(shape);
std::memcpy(reinterpret_cast<uint8_t*>(tensor.mutable_data<T>()),
reinterpret_cast<const uint8_t*>(data.data()),
data.size() * sizeof(T));
return AddNode(name, tensor, ptype, ltype);
}
template <typename T>
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
T value,
std::vector<int64_t> shape = {1},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
int64_t size = 1;
for (auto i : shape) {
size *= i;
}
std::vector<T> data(size, value);
return AddNode(name, data, shape, ltype);
}
// Data node
std::shared_ptr<ge::op::Data> AddNode(
const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<ge::Operator> GetNode(std::string name) {
CHECK(HasNode(name)) << "[NPU] Node " << name << " not found.";
return nodes_.at(name);
}
bool HasNode(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
private:
std::unordered_map<std::string, std::shared_ptr<ge::Operator>> nodes_;
std::unordered_map<std::string, int> counts_;
};
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
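A hedged usage sketch of the AddNode overloads above (all variable and node names below are made up for illustration):

// Illustrative only: how a bridge might use Graph.
void BuildExample(paddle::lite::subgraph::npu::Graph* graph,
                  const paddle::lite::Tensor& weight) {
  // Data node for a model input, float NCHW by default.
  auto x = graph->AddNode("x", std::vector<int64_t>{1, 3, 224, 224});
  // Const node filled from a lite Tensor.
  auto w = graph->AddNode("w", weight);
  // Const node from a scalar, broadcast to the given shape.
  auto one = graph->AddNode("one", 1.0f, std::vector<int64_t>{1});
  // An IR op node; adding "y" again later would rebind this node under a
  // uniqued key ("y_var_1"), so "y" always maps to the newest node.
  auto act = graph->AddNode<ge::op::Activation>("y");
  act->set_input_x(*graph->GetNode("x"));
  (void)w;
  (void)one;
}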
@@ -12,34 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int InterpolateConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input, output and attributes from lite op
  auto x_var_name = op_info->Input("X").front();
  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
  auto x_dims = x->dims();
  auto x_h = x_dims[2];
  auto x_w = x_dims[3];
  CHECK_EQ(x_dims.size(), 4);
  auto out_var_name = op_info->Output("Out").front();
  auto scale = op_info->GetAttr<float>("scale");
  auto out_w = op_info->GetAttr<int>("out_w");
  auto out_h = op_info->GetAttr<int>("out_h");

@@ -50,7 +48,7 @@
                "align_corners = false isn't "
                "supported in HiAI DDK";

  // Priority: OutSize > scale > out_h/out_w
  if (scale > 0) {
    out_h = static_cast<int>(x_h * scale);
    out_w = static_cast<int>(x_w * scale);

@@ -58,18 +56,17 @@
    out_w = out_w > 0 ? out_w : -1;
  }

  // Update out_h and out_w if has OutSize
  std::shared_ptr<ge::Operator> out_size_node = nullptr;
  if (HasInputArg(op_info, scope, "OutSize")) {
    auto out_size_var_name = op_info->Input("OutSize").front();
    if (graph->HasNode(out_size_var_name)) {
      out_size_node = graph->GetNode(out_size_var_name);
    } else {
      auto out_size = scope->FindVar(out_size_var_name)->GetMutable<Tensor>();
      CHECK_EQ(out_size->numel(), 2);
      auto out_size_data = out_size->mutable_data<int>();
      // Update out_h and out_w if has OutSize
      out_h = out_size_data[0];
      out_w = out_size_data[1];
    }

@@ -83,46 +80,37 @@
                   << " is too large, should not exceed " << largest_multiple
                   << " in HiAI DDK";
    }
    out_size_node = graph->AddNode(out_var_name + "/out_size",
                                   std::vector<int>({out_h, out_w}));
  }

  if (interp_method == "bilinear") {
    auto bilinear_interp_node =
        graph->AddNode<ge::op::ResizeBilinear>(out_var_name);
    bilinear_interp_node->set_input_x(*graph->GetNode(x_var_name));
    bilinear_interp_node->set_input_size(*out_size_node);
    bilinear_interp_node->set_attr_align_corners(align_corners);
  } else if (interp_method == "nearest") {
    auto nearest_interp_node =
        graph->AddNode<ge::op::ResizeNearestNeighbor>(out_var_name);
    nearest_interp_node->set_input_image(*graph->GetNode(x_var_name));
    nearest_interp_node->set_input_size(*out_size_node);
    nearest_interp_node->set_attr_align_corners(align_corners);
  } else {
    LOG(WARNING) << "[NPU] Unsupported interpolate method: " << interp_method;
    return FAILED;
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         bilinear_interp,
                         paddle::lite::subgraph::npu::InterpolateConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU,
                         nearest_interp,
                         paddle::lite::subgraph::npu::InterpolateConverter);
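A worked example of the size-resolution priority above, with assumed values:

// Worked example (values assumed): x shape {1, 3, 32, 32}, scale = 2.0,
// OutSize tensor = {100, 80}:
//   scale > 0        -> out_h = 64,  out_w = 64
//   OutSize present  -> out_h = 100, out_w = 80   (highest priority)
// matching the stated priority OutSize > scale > out_h/out_w.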
@@ -12,24 +12,24 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

// Note: all of the input weight vars should be handled in this converter
int MulConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();

@@ -37,6 +37,7 @@
  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
  auto x_dims = x->dims();
  auto y_dims = y->dims();
  auto out_var_name = op_info->Output("Out").front();
  int x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
  int y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
  int m = x_dims.Slice(0, x_num_col_dims).production();

@@ -44,61 +45,47 @@
  CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production())
      << "[NPU] columns of X must be equal with rows of Y";
  int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production();
  VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k;
  VLOG(3) << "x_var_name:" << x_var_name
          << ", is data: " << graph->HasNode(x_var_name);
  VLOG(3) << "y_var_name:" << y_var_name
          << ", is data: " << graph->HasNode(y_var_name);
  CHECK(graph->HasNode(x_var_name))
      << "[NPU] MatMul in HiAI DDK only supports X as data and Y as const yet.";
  auto mul_node = graph->AddNode<ge::op::MatMul>(out_var_name);

  // Add input x node which supports persistable and non-persistable tensor,
  // and reshape to (m, k)
  if (graph->HasNode(x_var_name)) {
    auto reshaped_x_node =
        graph->AddNode<ge::op::Reshape>(x_var_name + "/reshape");
    reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name));
    reshaped_x_node->set_attr_shape({m, k});
    reshaped_x_node->set_attr_axis(0);
    mul_node->set_input_x1(*reshaped_x_node);
  } else {
    auto x_const_node = graph->AddNode(x_var_name, *x, {m, k});
    mul_node->set_input_x1(*x_const_node);
  }

  // Add input y node which only supports persistable tensor, and reshape to
  // (k, n)
  if (graph->HasNode(y_var_name)) {
    auto reshaped_y_node =
        graph->AddNode<ge::op::Reshape>(y_var_name + "/reshape");
    reshaped_y_node->set_input_tensor(*graph->GetNode(y_var_name));
    reshaped_y_node->set_attr_shape({k, n});
    reshaped_y_node->set_attr_axis(0);
    mul_node->set_input_x2(*reshaped_y_node);
  } else {
    auto y_const_node = graph->AddNode(y_var_name, *y, {k, n});
    mul_node->set_input_x2(*y_const_node);
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, mul, paddle::lite::subgraph::npu::MulConverter);
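A worked example of the (m, k, n) flattening above, with assumed shapes:

// Assumed shapes: x_dims = {2, 3, 4, 5} with x_num_col_dims = 2,
// y_dims = {20, 7} with y_num_col_dims = 1:
//   m = 2 * 3  = 6    // x_dims.Slice(0, 2).production()
//   k = 4 * 5  = 20   // x_dims.Slice(2, 4).production()
//   k'= 20            // y_dims.Slice(0, 1).production(), must equal k
//   n = 7             // y_dims.Slice(1, 2).production()
// so x is reshaped to (6, 20), y to (20, 7), and MatMul yields (6, 7).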
@@ -12,38 +12,39 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int Pad2dConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto pad2d_node = graph->AddNode<ge::op::Pad>(out_var_name);
  pad2d_node->set_input_x(*graph->GetNode(x_var_name));

  auto mode = op_info->GetAttr<std::string>("mode");
  if (mode == "constant") {
    pad2d_node->set_attr_mode(0);
  } else if (mode == "reflect") {
    LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK";
    pad2d_node->set_attr_mode(1);
    return FAILED;
  } else {
    LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK";
    return FAILED;
  }

  auto x_dims = scope->FindTensor(x_var_name)->dims();

@@ -51,34 +52,25 @@
  CHECK_EQ(padding.size(), 4);
  int xds = x_dims.size();
  padding.insert(padding.begin(), xds * 2 - 4, 0);
  auto padding_const_node =
      graph->AddNode(out_var_name + "/padding", padding, {xds, 2});
  pad2d_node->set_input_padding(*padding_const_node);

  if (mode == "constant") {
    auto pad_value = op_info->GetAttr<float>("pad_value");
    auto pad_value_const_node =
        graph->AddNode(out_var_name + "/pad_value", pad_value);
    pad2d_node->set_input_constant_values(*pad_value_const_node);
    pad2d_node->set_attr_T(0);  // type of pad_value: 0:float 3:int32
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         pad2d,
                         paddle::lite::subgraph::npu::Pad2dConverter);
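A worked example of the padding layout handed to HiAI above, with assumed values:

// Assumed example: 4-D NCHW input, pad2d paddings = {1, 2, 3, 4}
// (top, bottom, left, right). After padding.insert(begin, 4*2-4 = 4 zeros):
//   {0, 0, 0, 0, 1, 2, 3, 4}
// viewed as the (xds, 2) = (4, 2) table of before/after pads per dimension:
//   N: {0, 0}   C: {0, 0}   H: {1, 2}   W: {3, 4}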
@@ -14,46 +14,40 @@

#pragma once

USE_SUBGRAPH_BRIDGE(NPU, sigmoid);
USE_SUBGRAPH_BRIDGE(NPU, relu);
USE_SUBGRAPH_BRIDGE(NPU, tanh);
USE_SUBGRAPH_BRIDGE(NPU, relu_clipped);
USE_SUBGRAPH_BRIDGE(NPU, leaky_relu);
USE_SUBGRAPH_BRIDGE(NPU, softsign);
USE_SUBGRAPH_BRIDGE(NPU, hard_sigmoid);

USE_SUBGRAPH_BRIDGE(NPU, batch_norm);
USE_SUBGRAPH_BRIDGE(NPU, concat);
USE_SUBGRAPH_BRIDGE(NPU, conv2d);
USE_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d);
USE_SUBGRAPH_BRIDGE(NPU, conv2d_transpose);

USE_SUBGRAPH_BRIDGE(NPU, elementwise_add);
USE_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_sub);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_mul);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_div);

USE_SUBGRAPH_BRIDGE(NPU, fc);
USE_SUBGRAPH_BRIDGE(NPU, bilinear_interp);
USE_SUBGRAPH_BRIDGE(NPU, nearest_interp);
USE_SUBGRAPH_BRIDGE(NPU, mul);
USE_SUBGRAPH_BRIDGE(NPU, pad2d);
USE_SUBGRAPH_BRIDGE(NPU, pool2d);
USE_SUBGRAPH_BRIDGE(NPU, reduce_mean);
USE_SUBGRAPH_BRIDGE(NPU, reshape);
USE_SUBGRAPH_BRIDGE(NPU, reshape2);
USE_SUBGRAPH_BRIDGE(NPU, scale);
USE_SUBGRAPH_BRIDGE(NPU, shuffle_channel);
USE_SUBGRAPH_BRIDGE(NPU, softmax);
USE_SUBGRAPH_BRIDGE(NPU, split);
USE_SUBGRAPH_BRIDGE(NPU, sqrt);
USE_SUBGRAPH_BRIDGE(NPU, square);
USE_SUBGRAPH_BRIDGE(NPU, transpose);
USE_SUBGRAPH_BRIDGE(NPU, transpose2);
@@ -13,30 +13,29 @@
// limitations under the License.

#include "lite/operators/pool_op.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int PoolConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto x = scope->FindTensor(x_var_name);
  auto out_var_name = op_info->Output("Out").front();
  auto pool_node = graph->AddNode<ge::op::Pooling>(out_var_name);
  pool_node->set_input_x(*graph->GetNode(x_var_name));

  int mode = 0;
  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");

@@ -47,7 +46,8 @@
    CHECK(op_info->GetAttr<bool>("exclusive"))
        << "[NPU] exclusive must be true in HiAI DDK";
  } else {
    LOG(WARNING) << "[NPU] Unsupported pooling type: " << pooling_type;
    return FAILED;
  }
  pool_node->set_attr_mode(mode);

@@ -67,8 +67,8 @@
  pool_node->set_attr_global_pooling(global_pooling);

  auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
  pool_node->set_attr_window(
      ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()));

  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  if (paddings.size() == 2L) {

@@ -78,42 +78,38 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the inputs size.";

  bool adaptive = false;
  if (op_info->HasAttr("adaptive")) {
    adaptive = op_info->GetAttr<bool>("adaptive");
  }
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  lite::operators::UpdatePadding(&paddings,
                                 global_pooling,
                                 adaptive,
                                 padding_algorithm,
                                 x->dims(),
                                 strides,
                                 ksize);

  pool_node->set_attr_pad(ge::AttrValue::LIST_INT{
      paddings[0], paddings[1], paddings[2], paddings[3]});
  pool_node->set_attr_stride(
      ge::AttrValue::LIST_INT(strides.begin(), strides.end()));

  int ceil_mode = 0;
  if (op_info->HasAttr("ceil_mode")) {
    ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
  }
  pool_node->set_attr_ceil_mode(ceil_mode);
  // pool_node->set_attr_data_mode(data_mode);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         pool2d,
                         paddle::lite::subgraph::npu::PoolConverter);
@@ -12,30 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ReduceMeanConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input and op attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto x_dims = scope->FindTensor(x_var_name)->dims();
  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
  auto dim = op_info->GetAttr<std::vector<int>>("dim");
  CHECK(!dim.empty()) << "[NPU] \"dim\" of reduce_mean should not be empty.";
  for (size_t i = 0; i < dim.size(); i++) {
    if (dim[i] < 0) {
      dim[i] += x_dims.size();

@@ -43,30 +44,16 @@
    }
  }
  std::sort(dim.begin(), dim.end());

  // Create reduce_mean (using reduce_sum + scale) node and set input node
  // from the node map
  auto reduce_sum_node =
      graph->AddNode<ge::op::ReduceSum>(out_var_name + "/reducesum");
  reduce_sum_node->set_input_x(*graph->GetNode(x_var_name));
  auto dim_const_node = graph->AddNode(out_var_name + "/dim", dim);
  reduce_sum_node->set_input_w(*dim_const_node);
  reduce_sum_node->set_attr_keep_dims(keep_dim);

  float scale = 1;
  for (size_t i = 0; i < dim.size(); i++) {
    scale /= x_dims[dim[i]];

@@ -88,24 +75,19 @@
  }

  auto filter_const_node =
      graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape);
  auto scale_node = graph->AddNode<ge::op::Scale>(out_var_name);
  scale_node->set_input_x(*reduce_sum_node);
  scale_node->set_input_filter(*filter_const_node);
  scale_node->set_attr_axis(1);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         reduce_mean,
                         paddle::lite::subgraph::npu::ReduceMeanConverter);
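A worked example of the scale factor above, with assumed values:

// Assumed example: x_dims = {2, 3, 4, 5}, dim = {1, 2}:
//   scale = 1 / (x_dims[1] * x_dims[2]) = 1 / (3 * 4) = 1/12
// so ReduceSum over dims {1, 2} followed by Scale(1/12) equals reduce_mean.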
@@ -17,25 +17,38 @@

namespace paddle {
namespace lite {
namespace subgraph {

Registry& Registry::Instance() {
  static Registry x;
  return x;
}

void Registry::Insert(const std::string& dev_type,
                      const std::string& op_type,
                      const cvt_func_type& cvt_func_name) {
  auto it = map_.find(dev_type);
  if (it == map_.end()) {
    map_.insert(std::make_pair(
        dev_type, std::unordered_map<std::string, cvt_func_type>()));
  }
  map_.at(dev_type).insert(std::make_pair(op_type, cvt_func_name));
}

const cvt_func_type& Registry::Select(const std::string& dev_type,
                                      const std::string& op_type) const {
  return map_.at(dev_type).at(op_type);
}

bool Registry::Exists(const std::string& dev_type,
                      const std::string& op_type) const {
  bool found = map_.find(dev_type) != map_.end();
  if (found) {
    found = map_.at(dev_type).find(op_type) != map_.at(dev_type).end();
  }
  return found;
}

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
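A hedged sketch of the lookup flow a subgraph kernel might drive with this registry (the function and its fallback policy are illustrative only):

// Illustrative only: check for a bridge before selecting and invoking it.
int ConvertOp(void* graph_ctx, paddle::lite::OpLite* op) {
  auto& registry = paddle::lite::subgraph::Registry::Instance();
  const std::string dev_type = "NPU";
  const std::string op_type = op->op_info()->Type();
  if (!registry.Exists(dev_type, op_type)) {
    // Caller would fall back to the origin program.
    return paddle::lite::subgraph::FAILED;
  }
  return registry.Select(dev_type, op_type)(graph_ctx, op);
}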
@@ -15,44 +15,46 @@

#pragma once

#include <functional>
#include <string>
#include <unordered_map>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"

namespace paddle {
namespace lite {
namespace subgraph {

const int FAILED = 1;
const int SUCCESS = 0;
const int REBUILD_WHEN_SHAPE_CHANGED = 2;
inline bool CHECK_FAILED(int status) { return status & FAILED; }
inline bool CHECK_SUCCESS(int status) { return !CHECK_FAILED(status); }
inline bool CHECK_REBUILD_WHEN_SHAPE_CHANGED(int status) {
  return status & REBUILD_WHEN_SHAPE_CHANGED;
}

using cvt_func_type = std::function<int(void* ctx, OpLite* op)>;
using cvt_map_type =
    std::unordered_map<std::string,
                       std::unordered_map<std::string, cvt_func_type>>;

class Registry {
 public:
  static Registry& Instance();

  void Insert(const std::string& dev_type,
              const std::string& op_type,
              const cvt_func_type& cvt_func_name);
  const cvt_func_type& Select(const std::string& dev_type,
                              const std::string& op_type) const;
  bool Exists(const std::string& dev_type, const std::string& op_type) const;

  Registry() = default;

 private:
  cvt_map_type map_;
  DISALLOW_COPY_AND_ASSIGN(Registry);
};

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

@@ -70,17 +72,18 @@
                       __test_global_namespace_##uniq_name##__>::value, \
                   msg)

#define REGISTER_SUBGRAPH_BRIDGE(dev_type, op_type, cvt_func_name)        \
  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                               \
      __reg_subgraph_bridge_##dev_type##_##op_type##__,                   \
      "REGISTER_SUBGRAPH_BRIDGE must be called in global namespace only " \
      "once!");                                                           \
  int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert() {           \
    paddle::lite::subgraph::Registry::Instance().Insert(                  \
        #dev_type, #op_type, cvt_func_name);                              \
    return 0;                                                             \
  }

#define USE_SUBGRAPH_BRIDGE(dev_type, op_type)                            \
  extern int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert();     \
  static int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert_return \
      UNUSED = __reg_subgraph_bridge_##dev_type##_##op_type##_Insert();
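The status codes compose as bit flags, which the CHECK_* helpers above rely on; a small self-contained check, assuming registry.h is on the include path:

#include <cassert>
#include "lite/kernels/npu/bridges/registry.h"

int main() {
  using namespace paddle::lite::subgraph;
  assert(CHECK_SUCCESS(SUCCESS));
  // REBUILD_WHEN_SHAPE_CHANGED is not a failure, only a rebuild marker.
  assert(CHECK_SUCCESS(REBUILD_WHEN_SHAPE_CHANGED));
  assert(CHECK_REBUILD_WHEN_SHAPE_CHANGED(REBUILD_WHEN_SHAPE_CHANGED));
  assert(CHECK_FAILED(FAILED));
  return 0;
}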
@@ -13,48 +13,49 @@
 // limitations under the License.

 #include "lite/operators/reshape_op.h"
-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
-                               const node_map_type& inputs_map) {
-  auto scope = reshape_op->scope();
-  auto op_info = reshape_op->op_info();
+int ReshapeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  // get input, output and op attributes
+  // Get input, output and op attributes
   auto x_var_name = op_info->Input("X").front();
-  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto out_var_name = op_info->Output("Out").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
   auto x_dims = x->dims();

-  // create reshape node and set input node from inputs_map
-  auto reshape_node = std::make_shared<ge::op::Reshape>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  reshape_node->set_input_tensor(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
+  // Create reshape node and set input node from inputs_map
+  auto reshape_node = graph->AddNode<ge::op::Reshape>(out_var_name);
+  reshape_node->set_input_tensor(*graph->GetNode(x_var_name));

-  // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr)
-  if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) {
-    LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor.";
-  } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) {
+  // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr)
+  if (HasInputArg(op_info, scope, "ShapeTensor")) {
+    LOG(WARNING) << "[NPU] not support \"Shape\" from more than one Tensor.";
+    return FAILED;
+  } else if (HasInputArg(op_info, scope, "Shape")) {
     auto actual_shape_var_name = op_info->Input("Shape").front();
-    if (!inputs_map.count(actual_shape_var_name)) {
+    if (!graph->HasNode(actual_shape_var_name)) {
       auto actual_shape =
-          scope->FindVar(actual_shape_var_name)->GetMutable<lite::Tensor>();
+          scope->FindVar(actual_shape_var_name)->GetMutable<Tensor>();
       auto actual_shape_dims = actual_shape->dims();
       auto actual_shape_data = actual_shape->mutable_data<int>();
       auto shape =
           std::vector<int>(actual_shape_data,
                            actual_shape_data + actual_shape_dims.production());
-      auto out_dims = operators::ValidateShape(shape, x_dims);
+      auto out_dims = lite::operators::ValidateShape(shape, x_dims);
       auto out_shape = out_dims.Vectorize();
       if (out_shape.size() > 4) {
         LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
@@ -62,19 +63,15 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
                      << out_shape.size();
       }
       auto actual_shape_const_node =
-          std::make_shared<ge::op::Const>(actual_shape_var_name);
-      actual_shape_const_node->set_attr_value(
-          lite::npu::CreateTensorAndFillData(
-              std::vector<int>(out_shape.begin(), out_shape.end())));
+          graph->AddNode(actual_shape_var_name,
+                         std::vector<int>(out_shape.begin(), out_shape.end()));
       reshape_node->set_input_w(*actual_shape_const_node);
-      lite::npu::OpList::Global().add(actual_shape_const_node);
     } else {
-      reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name));
-      lite::npu::OpList::Global().add(inputs_map.at(actual_shape_var_name));
+      reshape_node->set_input_w(*graph->GetNode(actual_shape_var_name));
     }
   } else {
     auto shape = op_info->GetAttr<std::vector<int>>("shape");
-    auto out_dims = operators::ValidateShape(shape, x_dims);
+    auto out_dims = lite::operators::ValidateShape(shape, x_dims);
     auto out_shape = out_dims.Vectorize();
     if (out_shape.size() > 4) {
       LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
@@ -84,12 +81,9 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
     reshape_node->set_attr_shape(
         ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
   }
-  lite::npu::OpList::Global().add(reshape_node);

-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = reshape_node;
   if (op_type == "reshape2") {
-    // append an extra reshape node to calc XShape
+    // Append an extra reshape node to calc XShape
     std::vector<int64_t> xshape_dims(x_dims.size() + 1, 1);
     for (size_t i = 0; i < x_dims.size(); i++) {
       xshape_dims[i + 1] = x_dims[i];
@@ -99,24 +93,23 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
                    "but XShape has "
                 << xshape_dims.size();
     }
-    auto xshape_node =
-        std::make_shared<ge::op::Reshape>(unique_op_type + "/xshape");
-    xshape_node->set_input_tensor(*inputs_map.at(x_var_name));
+    auto xshape_var_name = op_info->Output("XShape").front();
+    auto xshape_node = graph->AddNode<ge::op::Reshape>(xshape_var_name);
+    xshape_node->set_input_tensor(*graph->GetNode(x_var_name));
     xshape_node->set_attr_shape(
         ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end()));
-    lite::npu::OpList::Global().add(xshape_node);
-    outputs_map[op_info->Output("XShape").front()] = xshape_node;
   }
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(reshape,
-                    paddle::lite::kernels::npu::bridges::ReshapeConverter);
-REGISTER_NPU_BRIDGE(reshape2,
-                    paddle::lite::kernels::npu::bridges::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         reshape,
+                         paddle::lite::subgraph::npu::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         reshape2,
+                         paddle::lite::subgraph::npu::ReshapeConverter);
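As a reading aid, a rough standalone model of what lite::operators::ValidateShape computes above, assuming Paddle's reshape conventions (0 copies the corresponding input dimension, a single -1 is inferred from the remaining element count). The helper name InferReshapeDims is illustrative, not the library's:

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative only: mirrors Paddle's reshape rules, not the real
// lite::operators::ValidateShape implementation.
std::vector<int64_t> InferReshapeDims(const std::vector<int>& shape,
                                      const std::vector<int64_t>& x_dims) {
  int64_t x_numel = 1;
  for (auto d : x_dims) x_numel *= d;
  std::vector<int64_t> out(shape.size());
  int64_t known = 1;
  int infer_idx = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      assert(infer_idx < 0);  // at most one dimension may be -1
      infer_idx = static_cast<int>(i);
    } else if (shape[i] == 0) {
      out[i] = x_dims[i];  // 0 means "keep this input dimension"
      known *= out[i];
    } else {
      out[i] = shape[i];
      known *= out[i];
    }
  }
  if (infer_idx >= 0) out[infer_idx] = x_numel / known;  // infer the -1 slot
  return out;
}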
@@ -12,28 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
-                             const node_map_type& inputs_map) {
-  auto scope = scale_op->scope();
-  auto op_info = scale_op->op_info();
+int ScaleConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  // get input, output and op attributes
+  // Get input, output and op attributes
   auto x_var_name = op_info->Input("X").front();
   auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
   auto x_dims = x->dims().Vectorize();
   CHECK_GE(x_dims.size(), 2);
+  auto out_var_name = op_info->Output("Out").front();
   std::vector<int64_t> scale_bias_shape = {x_dims[1]};
   float scale = op_info->GetAttr<float>("scale");
   float bias = op_info->GetAttr<float>("bias");
@@ -42,43 +44,31 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
     bias *= scale;
   }

-  // create scale node and set input node from inputs_map
-  auto scale_node = std::make_shared<ge::op::Scale>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  scale_node->set_input_x(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(scale_node);
+  // Create scale node and set input node from inputs_map
+  auto scale_node = graph->AddNode<ge::op::Scale>(out_var_name);
+  scale_node->set_input_x(*graph->GetNode(x_var_name));

-  // add filter node(fill with scale)
-  auto filter_const_node =
-      std::make_shared<ge::op::Const>(unique_op_type + "/filter");
-  filter_const_node->set_attr_value(
-      lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
+  // Add filter node(fill with scale)
+  auto filter_const_node =
+      graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape);
   scale_node->set_input_filter(*filter_const_node);
-  lite::npu::OpList::Global().add(filter_const_node);

-  // add bias node(fill with bias)
+  // Add bias node(fill with bias)
   if (fabs(bias) > 1e-6f) {
     auto bias_const_node =
-        std::make_shared<ge::op::Const>(unique_op_type + "/bias");
-    bias_const_node->set_attr_value(
-        lite::npu::CreateTensorAndFillData(bias, scale_bias_shape));
+        graph->AddNode(out_var_name + "/bias", bias, scale_bias_shape);
     scale_node->set_input_bias(*bias_const_node);
     scale_node->set_attr_has_bias_value(true);
-    lite::npu::OpList::Global().add(bias_const_node);
   }
   scale_node->set_attr_axis(1);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = scale_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(scale, paddle::lite::kernels::npu::bridges::ScaleConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         scale,
+                         paddle::lite::subgraph::npu::ScaleConverter);
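The bias *= scale fold in the converter above follows from the scale op's definition; a scalar reference sketch (illustrative, not library code) of the arithmetic the NPU Scale node has to reproduce:

// Paddle's scale op:
//   bias_after_scale == true  : y = scale * x + bias
//   bias_after_scale == false : y = scale * (x + bias) = scale * x + scale * bias
// so folding bias *= scale reduces both cases to y = scale * x + bias, which
// is exactly what a Scale node with constant filter/bias computes.
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;
  return scale * x + bias;
}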
@@ -12,45 +12,39 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ShuffleChannelConverter(
-    const std::shared_ptr<lite::OpLite> shuffle_channel_op,
-    const node_map_type& inputs_map) {
-  auto scope = shuffle_channel_op->scope();
-  auto op_info = shuffle_channel_op->op_info();
+int ShuffleChannelConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node =
-      std::make_shared<ge::op::ShuffleChannel>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name));
+  auto shuffle_channel_node =
+      graph->AddNode<ge::op::ShuffleChannel>(out_var_name);
+  shuffle_channel_node->set_input_x(*graph->GetNode(x_var_name));
   shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group"));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(shuffle_channel_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = shuffle_channel_node;
-  return outputs_map;
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(
-    shuffle_channel,
-    paddle::lite::kernels::npu::bridges::ShuffleChannelConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         shuffle_channel,
+                         paddle::lite::subgraph::npu::ShuffleChannelConverter);
@@ -12,27 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
-                               const node_map_type& inputs_map) {
-  auto scope = softmax_op->scope();
-  auto op_info = softmax_op->op_info();
+int SoftmaxConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Softmax> softmax_node =
-      std::make_shared<ge::op::Softmax>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();
   auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims();
   auto axis = op_info->GetAttr<int>("axis");
   if (x_dims.size() > 3) {
@@ -41,23 +40,17 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
                  << " :x_w = " << x_dims[3];
   }

-  CHECK(inputs_map.count(x_var_name));
-  softmax_node->set_input_x(*inputs_map.at(x_var_name));
+  auto softmax_node = graph->AddNode<ge::op::Softmax>(out_var_name);
+  softmax_node->set_input_x(*graph->GetNode(x_var_name));
   softmax_node->set_attr_axis(axis);
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(softmax_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = softmax_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(softmax,
-                    paddle::lite::kernels::npu::bridges::SoftmaxConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         softmax,
+                         paddle::lite::subgraph::npu::SoftmaxConverter);
@@ -12,70 +12,60 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op,
-                             const node_map_type& inputs_map) {
-  lite::Scope* scope = split_op->scope();
-  const lite::OpInfo* op_info = split_op->op_info();
+int SplitConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " << op_type << " ... ";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " << op_type << " ... ";

   auto x_var_name = op_info->Input("X").front();
+  auto out_var_names = op_info->Output("Out");
   auto axis = op_info->GetAttr<int>("axis");
   auto num = op_info->GetAttr<int>("num");
   auto sections = op_info->GetAttr<std::vector<int>>("sections");
   int64_t sections_num = static_cast<int64_t>(sections.size());

-  std::shared_ptr<ge::op::Split> output_node =
-      std::make_shared<ge::op::Split>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  output_node->set_input_x(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  output_node->set_attr_axis(static_cast<int64_t>(axis));
+  auto split_node = graph->AddNode<ge::op::Split>(op_type + "/" + x_var_name);
+  split_node->set_input_x(*graph->GetNode(x_var_name));
+  split_node->set_attr_axis(static_cast<int64_t>(axis));
   if (num > 0) {
-    output_node->set_attr_output_num(static_cast<int64_t>(num));
+    split_node->set_attr_output_num(static_cast<int64_t>(num));
   } else {
-    output_node->set_attr_output_num(sections_num);
+    split_node->set_attr_output_num(sections_num);
     auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end());
-    output_node->set_attr_size_split(size_split);
+    split_node->set_attr_size_split(size_split);
   }

-  node_map_type outputs_map;
-  auto out_var_names = op_info->Output("Out");
-  output_node->create_dynamic_output_y(out_var_names.size());
-  int index = 1;
-  for (auto out_var_name : out_var_names) {
-    auto const_node = std::make_shared<ge::op::Const>(
-        unique_op_type + "/const_zero" + std::to_string(index));
-    const_node->set_attr_value(lite::npu::CreateTensorAndFillData(0));
-    lite::npu::OpList::Global().add(const_node);
-    auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add" +
-                                                  std::to_string(index));
-    add_node->set_input_x1(*output_node, "y" + std::to_string(index));
-    add_node->set_input_x2(*const_node);
-    outputs_map[out_var_name] = add_node;
-    lite::npu::OpList::Global().add(add_node);
-    index++;
+  split_node->create_dynamic_output_y(out_var_names.size());
+  int idx = 1;
+  for (auto& out_var_name : out_var_names) {
+    auto zero_const_node =
+        graph->AddNode(out_var_name + "/zero" + std::to_string(idx), 0);
+    auto add_node = graph->AddNode<ge::op::Add>(out_var_name);
+    add_node->set_input_x1(*split_node, "y" + std::to_string(idx));
+    add_node->set_input_x2(*zero_const_node);
+    idx++;
   }
-
-  lite::npu::OpList::Global().add(output_node);
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(split, paddle::lite::kernels::npu::bridges::SplitConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         split,
+                         paddle::lite::subgraph::npu::SplitConverter);
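Two details above deserve a note: HiAI Split exposes its outputs as dynamic ports y1..yn of a single node, so each port is routed through an x + 0 Add node purely to give every Paddle output variable a graph node under its own name; and the num/sections attributes encode the per-output sizes along axis. A small illustrative helper (ours, not library code) for the latter:

#include <vector>

// Illustrative: the size of each split output along `axis`.
// num > 0 means `num` equal parts (axis_dim must be divisible by num);
// otherwise `sections` lists explicit per-output sizes.
std::vector<int> SplitSectionSizes(int axis_dim, int num,
                                   const std::vector<int>& sections) {
  if (num > 0) {
    return std::vector<int>(num, axis_dim / num);
  }
  return sections;
}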
@@ -12,43 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SqrtConverter(const std::shared_ptr<lite::OpLite> sqrt_op,
-                            const node_map_type& inputs_map) {
-  auto scope = sqrt_op->scope();
-  auto op_info = sqrt_op->op_info();
+int SqrtConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Sqrt> sqrt_node =
-      std::make_shared<ge::op::Sqrt>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  CHECK(inputs_map.count(x_var_name));
-  sqrt_node->set_input_x(*inputs_map.at(x_var_name));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(sqrt_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = sqrt_node;
-  return outputs_map;
+  auto sqrt_node = graph->AddNode<ge::op::Sqrt>(out_var_name);
+  sqrt_node->set_input_x(*graph->GetNode(x_var_name));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU, sqrt, paddle::lite::subgraph::npu::SqrtConverter);
@@ -12,44 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SquareConverter(const std::shared_ptr<lite::OpLite> square_op,
-                              const node_map_type& inputs_map) {
-  auto scope = square_op->scope();
-  auto op_info = square_op->op_info();
+int SquareConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Square> square_node =
-      std::make_shared<ge::op::Square>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  CHECK(inputs_map.count(x_var_name));
-  square_node->set_input_x(*inputs_map.at(x_var_name));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(square_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = square_node;
-  return outputs_map;
+  auto square_node = graph->AddNode<ge::op::Square>(out_var_name);
+  square_node->set_input_x(*graph->GetNode(x_var_name));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(square,
-                    paddle::lite::kernels::npu::bridges::SquareConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         square,
+                         paddle::lite::subgraph::npu::SquareConverter);
@@ -12,64 +12,45 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type TransposeConverter(
-    const std::shared_ptr<lite::OpLite> transpose_op,
-    const node_map_type& inputs_map) {
-  auto scope = transpose_op->scope();
-  auto op_info = transpose_op->op_info();
+int TransposeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Permute> transpose_node =
-      std::make_shared<ge::op::Permute>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
-
-  // paddlelite doesn't have this input
-  // w must be set, but it does nothing
-  auto w_var_name = unique_op_type + "/w";
-  auto* w = scope->Var(w_var_name)->GetMutable<Tensor>();
-  w->Resize({1});
-  auto* w_data = w->mutable_data<float>();
-  for (int i = 0; i < w->numel(); i++) {
-    w_data[i] = 1.f;
-  }
-  auto npu_w = std::make_shared<ge::op::Const>(w_var_name);
-  npu_w->set_attr_value(lite::npu::CvtTensor(w));
-  lite::npu::OpList::Global().add(npu_w);
-
+  auto out_var_name = op_info->Output("Out").front();
   auto axis = op_info->GetAttr<std::vector<int>>("axis");
-  auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end());
-
-  CHECK(inputs_map.count(x_var_name));
-  transpose_node->set_input_x(*inputs_map.at(x_var_name));
-  transpose_node->set_input_w(*npu_w);
-  transpose_node->set_attr_order(npu_axis);
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(transpose_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = transpose_node;
-  return outputs_map;
+
+  // The Permute op requires an input "w" that Paddle Lite doesn't have; feed
+  // it a dummy one-element constant, as the old converter did.
+  auto transpose_node = graph->AddNode<ge::op::Permute>(out_var_name);
+  transpose_node->set_input_x(*graph->GetNode(x_var_name));
+  auto w_const_node = graph->AddNode(out_var_name + "/w", 1.0f);
+  transpose_node->set_input_w(*w_const_node);
+  transpose_node->set_attr_order(
+      ge::AttrValue::LIST_INT(axis.begin(), axis.end()));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(transpose,
-                    paddle::lite::kernels::npu::bridges::TransposeConverter);
-REGISTER_NPU_BRIDGE(transpose2,
-                    paddle::lite::kernels::npu::bridges::TransposeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         transpose,
+                         paddle::lite::subgraph::npu::TransposeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         transpose2,
+                         paddle::lite::subgraph::npu::TransposeConverter);
@@ -12,53 +12,45 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type UnsqueezeConverter(
-    const std::shared_ptr<lite::OpLite> unsqueeze_op,
-    const node_map_type& inputs_map) {
-  auto scope = unsqueeze_op->scope();
-  auto op_info = unsqueeze_op->op_info();
+int UnsqueezeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " << op_type << "... ";

-  std::shared_ptr<ge::op::Reshape> unsqueeze_node =
-      std::make_shared<ge::op::Reshape>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
-  CHECK(inputs_map.count(x_var_name));
-  unsqueeze_node->set_input_tensor(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(unsqueeze_node);
-
-  CHECK(op_info->HasAttr("axes"))
-      << "[NPU] unsqueeze not support axes from tensor now";
   auto out_var_name = op_info->Output("Out").front();
   auto out_shape = scope->FindTensor(out_var_name)->dims().Vectorize();
+  CHECK(op_info->HasAttr("axes"))
+      << "[NPU] unsqueeze not support axes from tensor now";
+
+  auto unsqueeze_node = graph->AddNode<ge::op::Reshape>(out_var_name);
+  unsqueeze_node->set_input_tensor(*graph->GetNode(x_var_name));
   unsqueeze_node->set_attr_shape(
       ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = unsqueeze_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(unsqueeze,
-                    paddle::lite::kernels::npu::bridges::UnsqueezeConverter);
-REGISTER_NPU_BRIDGE(unsqueeze2,
-                    paddle::lite::kernels::npu::bridges::UnsqueezeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         unsqueeze,
+                         paddle::lite::subgraph::npu::UnsqueezeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         unsqueeze2,
+                         paddle::lite::subgraph::npu::UnsqueezeConverter);
@@ -12,59 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
-#include <mutex>  // NOLINT
+#include "lite/kernels/npu/bridges/utility.h"
 #include <utility>
-#include "lite/backends/npu/runtime.h"

 namespace paddle {
 namespace lite {
+namespace subgraph {
 namespace npu {

-// Build HIAI IR graph to om model, and store om model data into lite tensor
-bool BuildModel(std::vector<ge::Operator>& inputs,   // NOLINT
-                std::vector<ge::Operator>& outputs,  // NOLINT
-                lite::Tensor* model_data) {
-  LOG(INFO) << "[NPU] Build model.";
-  CHECK_GT(inputs.size(), 0);
-  CHECK_GT(outputs.size(), 0);
-  CHECK_NE(model_data, 0);
-  // build IR graph to om model
-  ge::Graph ir_graph("graph");
-  ir_graph.SetInputs(inputs).SetOutputs(outputs);
-  ge::Model om_model("model", "model");
-  om_model.SetGraph(ir_graph);
-  domi::HiaiIrBuild ir_build;
-  domi::ModelBufferData om_model_buf;
-  if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
-    LOG(WARNING) << "[NPU] CreateModelBuff failed!";
-    return false;
-  }
-  if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
-    LOG(WARNING) << "[NPU] BuildIRModel failed!";
-    return false;
-  }
-  // store om model into tensor
-  model_data->Resize({om_model_buf.length});
-  memcpy(model_data->mutable_data<int8_t>(),
-         om_model_buf.data,
-         om_model_buf.length);
-  ir_build.ReleaseModelBuff(om_model_buf);
-  return true;
-}
-
-std::string UniqueName(const std::string& prefix) {
-  static std::mutex counter_mtx;
-  static std::unordered_map<std::string, int> counter_map;
-  std::unique_lock<std::mutex> counter_lck(counter_mtx);
-  int counter = 1;
-  auto it = counter_map.find(prefix);
-  if (it == counter_map.end()) {
-    counter_map[prefix] = counter;
-  } else {
-    counter = ++(it->second);
-  }
-  return prefix + "_" + std::to_string(counter);
-}
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname) {
+  auto iarg_names = op_info->input_argnames();
+  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
+      iarg_names.end()) {
+    auto inputs = op_info->Input(argname);
+    if (inputs.empty()) {
+      return false;
+    }
+    auto var_name = inputs.front();
+    auto var = scope->FindVar(var_name);
+    return var != nullptr;
+  } else {
+    return false;
+  }
+}

 ge::DataType CvtPrecisionType(PrecisionType itype) {
@@ -102,25 +73,25 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) {
   return otype;
 }

-ge::TensorPtr CvtTensor(lite::Tensor* in_tensor,
+ge::TensorPtr CvtTensor(const Tensor& in_tensor,
                         std::vector<int64_t> out_shape,
                         PrecisionType in_ptype,
                         DataLayoutType in_ltype) {
-  uint8_t* in_data = nullptr;
-  auto in_size = in_tensor->dims().production();
-  auto in_shape = in_tensor->dims().Vectorize();
+  const uint8_t* in_data = nullptr;
+  auto in_size = in_tensor.dims().production();
+  auto in_shape = in_tensor.dims().Vectorize();
   if (out_shape.empty()) {
     out_shape = in_shape;
   }
   int in_bytes;
   if (in_ptype == PRECISION(kFloat)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<float>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<float>());
     in_bytes = in_size * sizeof(float);
   } else if (in_ptype == PRECISION(kInt32)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int32_t>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int32_t>());
    in_bytes = in_size * sizeof(int32_t);
   } else if (in_ptype == PRECISION(kInt8)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int8_t>());
     in_bytes = in_size * sizeof(int8_t);
   } else {
     LOG(FATAL) << "[NPU] Unknown precision type " << PrecisionToStr(in_ptype);
@@ -169,24 +140,7 @@ int CvtActMode(std::string act_type) {
   return act_mode;
 }

-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname) {
-  auto iarg_names = op_info->input_argnames();
-  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
-      iarg_names.end()) {
-    auto inputs = op_info->Input(argname);
-    if (inputs.empty()) {
-      return false;
-    }
-    auto var_name = inputs.front();
-    auto var = scope->FindVar(var_name);
-    return var != nullptr;
-  } else {
-    return false;
-  }
-}
-
 }  // namespace npu
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle
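A usage sketch for the relocated HasInputArg, mirroring the pattern in the reshape converter above (the helper name GetTargetShape is illustrative, not part of the library):

// Sketch: prefer the optional "Shape" input when it is both declared and
// bound to a variable in the scope; otherwise fall back to the "shape" attr.
std::vector<int> GetTargetShape(const OpInfo* op_info, Scope* scope) {
  if (HasInputArg(op_info, scope, "Shape")) {
    auto* t = scope->FindVar(op_info->Input("Shape").front())
                  ->GetMutable<Tensor>();
    auto* data = t->mutable_data<int>();
    return std::vector<int>(data, data + t->dims().production());
  }
  return op_info->GetAttr<std::vector<int>>("shape");
}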
@@ -14,6 +14,7 @@
 #pragma once

+#include <functional>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -24,12 +25,10 @@
 #include "ai_ddk_lib/include/graph/op/all_ops.h"
 #include "ai_ddk_lib/include/graph/operator.h"
 #include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "ai_ddk_lib/include/hiai_ir_build.h"
 #include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
+#include "lite/utils/macros.h"

-// Extended Ops of HIAI DDK
+// Extended ops based on HIAI DDK
 namespace ge {
 /**
  * Pads a tensor.
@@ -59,39 +58,25 @@ REG_OP(Pad)

 namespace paddle {
 namespace lite {
+namespace subgraph {
 namespace npu {

-class OpList {
- public:
-  static OpList& Global() {
-    static thread_local OpList x;
-    return x;
-  }
-  void clear() { lists_.clear(); }
-  void add(std::shared_ptr<ge::Operator> p) { lists_.push_back(p); }
-
- private:
-  std::vector<std::shared_ptr<ge::Operator>> lists_;
-};
-
-// Build HIAI IR graph to om model, and store om model data into lite tensor
-bool BuildModel(std::vector<ge::Operator>& inputs,   // NOLINT
-                std::vector<ge::Operator>& outputs,  // NOLINT
-                lite::Tensor* model_data);
-
-std::string UniqueName(const std::string& prefix);
+// Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);

 ge::DataType CvtPrecisionType(PrecisionType itype);

 ge::Format CvtDataLayoutType(DataLayoutType itype);

-ge::TensorPtr CvtTensor(Tensor* in_tensor,
+ge::TensorPtr CvtTensor(const Tensor& in_tensor,
                         std::vector<int64_t> out_shape = {},
                         PrecisionType in_ptype = PRECISION(kFloat),
                         DataLayoutType in_ltype = DATALAYOUT(kNCHW));

 template <typename T>
-ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
+ge::TensorPtr CreateTensorAndFillData(const std::vector<T>& data,
                                       std::vector<int64_t> shape = {},
                                       ge::Format format = ge::FORMAT_NCHW) {
   const std::type_info& info = typeid(T);
@@ -136,10 +121,7 @@ ge::TensorPtr CreateTensorAndFillData(T value,

 int CvtActMode(std::string act_type);

-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname);
-
 }  // namespace npu
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle
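A usage sketch for the const-tensor helpers declared above (names and values illustrative; the exact overload resolution is an assumption based on the signatures shown):

void ConstTensorSketch() {
  // Scalar fill: a 16-element float tensor holding 1.0f (assumed shape {16}).
  auto ones = CreateTensorAndFillData(1.0f, {16});
  // Vector fill: a 4-element int tensor holding {1, 3, 224, 224}.
  auto dims = CreateTensorAndFillData(std::vector<int>({1, 3, 224, 224}));
  // Back a const node with the tensor, as the NPU converters do.
  ge::op::Const w("w_const");
  w.set_attr_value(ones);
  (void)dims;
}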
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/graph_compute.h"
#include <sys/time.h>
#include <time.h>
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
void GraphCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<NPUContext>();
auto& param = this->Param<param_t>();
// Load HiAI model from the weight tensor and release its buffer
// to save memory
CHECK(param.weight);
CHECK(lite::npu::LoadModel(*param.weight, &model_client_, &model_name_));
// TODO(hong19860320): find a good way to free the model data.
// No interface exists to free a tensor's data, so resize the dim to 1 and
// change the target to force reallocation of a small buffer.
param.weight->Resize({1});
param.weight->mutable_data<int8_t>(TargetType::kARM);
CHECK(model_client_);
// Query the dimensions of NPU input and output tensors from HiAI model
std::vector<hiai::TensorDimension> npu_idims;
std::vector<hiai::TensorDimension> npu_odims;
int ret =
model_client_->GetModelIOTensorDim(model_name_, npu_idims, npu_odims);
CHECK_EQ(ret, hiai::AI_SUCCESS)
<< "[NPU] Get the dimensions of input and output tensors failed.";
// Check whether the data sizes of NPU input and output tensors are the
// same as CPU's, then create and initialize NPU input and output tensors.
npu_itensors_.resize(npu_idims.size());
npu_otensors_.resize(npu_odims.size());
npu_idatasizes_.resize(npu_idims.size());
npu_odatasizes_.resize(npu_odims.size());
for (size_t i = 0; i < npu_idims.size(); ++i) {
auto cpu_itensor = param.inputs[i].second;
CHECK(cpu_itensor);
VLOG(3) << "[NPU] CPU input dims[" << i << "]: " << cpu_itensor->dims();
VLOG(3) << "[NPU] NPU input dims[" << i << "]: {"
<< npu_idims[i].GetNumber() << "," << npu_idims[i].GetChannel()
<< "," << npu_idims[i].GetHeight() << "," << npu_idims[i].GetWidth()
<< "}";
npu_idatasizes_[i] = npu_idims[i].GetNumber() * npu_idims[i].GetChannel() *
npu_idims[i].GetHeight() * npu_idims[i].GetWidth();
CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]);
npu_itensors_[i].reset(new hiai::AiTensor);
npu_itensors_[i]->Init(&(npu_idims[i]));
}
for (size_t i = 0; i < npu_odims.size(); ++i) {
auto cpu_otensor = param.outputs[i].second;
CHECK(cpu_otensor);
VLOG(3) << "[NPU] CPU output dims[" << i << "]: " << cpu_otensor->dims();
VLOG(3) << "[NPU] NPU output dims[" << i << "]: {"
<< npu_odims[i].GetNumber() << "," << npu_odims[i].GetChannel()
<< "," << npu_odims[i].GetHeight() << "," << npu_odims[i].GetWidth()
<< "}";
npu_odatasizes_[i] = npu_odims[i].GetNumber() * npu_odims[i].GetChannel() *
npu_odims[i].GetHeight() * npu_odims[i].GetWidth();
if (cpu_otensor->dims().production() != npu_odatasizes_[i]) {
cpu_otensor->Resize({npu_odims[i].GetNumber(),
npu_odims[i].GetChannel(),
npu_odims[i].GetHeight(),
npu_odims[i].GetWidth()});
}
npu_otensors_[i].reset(new hiai::AiTensor);
npu_otensors_[i]->Init(&(npu_odims[i]));
}
}
void GraphCompute::Run() {
auto& param = this->Param<param_t>();
// Check whether the data sizes of NPU input tensors are the same as
// CPU's, and copy the data of CPU input tensors to NPU's.
CHECK_EQ(param.inputs.size(), npu_itensors_.size());
CHECK_EQ(param.outputs.size(), npu_otensors_.size());
for (size_t i = 0; i < param.inputs.size(); ++i) {
auto cpu_itensor = param.inputs[i].second;
CHECK(cpu_itensor);
CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]);
std::memcpy(static_cast<float*>(npu_itensors_[i]->GetBuffer()),
cpu_itensor->data<float>(),
sizeof(float) * static_cast<size_t>(npu_idatasizes_[i]));
}
// Run HiAI model with model name
std::string key = "model_name";  // Note: the key must be exactly "model_name"
model_context_.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(hiai::AI_SUCCESS,
model_client_->Process(
model_context_, npu_itensors_, npu_otensors_, 1000, istamp));
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Check whether the data sizes of NPU output tensors are the same as
// CPU's, and copy the data of NPU output tensors to CPU's.
for (size_t i = 0; i < param.outputs.size(); ++i) {
auto cpu_otensor = param.outputs[i].second;
CHECK(cpu_otensor);
CHECK_EQ(cpu_otensor->dims().production(), npu_odatasizes_[i]);
std::memcpy(cpu_otensor->mutable_data<float>(),
static_cast<float*>(npu_otensors_[i]->GetBuffer()),
sizeof(float) * static_cast<size_t>(npu_odatasizes_[i]));
}
}
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(graph_op,
kNPU,
kFloat,
kNCHW,
paddle::lite::kernels::npu::GraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "lite/backends/npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all input data vars and add them into the HiAI IR graph
subgraph::npu::Graph graph;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph.AddNode(input_name, input_tensor->dims().Vectorize());
CHECK(input_node);
// HiAI DDK doesn't support dynamic dimensions/shapes, so the program needs
// to be rebuilt when the shape of any input tensor changes.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
// Convert all ops and their weights and add them into the HiAI IR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists("NPU", op_type)) {
return subgraph::FAILED;
}
status |= bridges.Select("NPU", op_type)(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Set the input and output nodes of the HiAI IR graph
std::vector<ge::Operator> input_nodes, output_nodes;
for (auto& input_name : input_names_) {
input_nodes.push_back(*graph.GetNode(input_name));
}
for (auto& output_name : output_names_) {
output_nodes.push_back(*graph.GetNode(output_name));
}
// Build the HiAI IR graph to HiAI om model
device_program_ =
lite::npu::Device::Global().Build(model_name_, input_nodes, output_nodes);
if (device_program_ == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED;
}
// Query and check the dimensions of input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED;
}
CHECK_EQ(device_idims.size(), input_names_.size());
CHECK_EQ(device_odims.size(), output_names_.size());
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
device_idatasizes_.resize(input_names_.size());
device_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
device_odatasizes_.resize(output_names_.size());
device_otensors_.resize(output_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[NPU] Input dims[" << i << "]: {" << device_idims[i].GetNumber()
<< "," << device_idims[i].GetChannel() << ","
<< device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
<< "}";
device_idatasizes_[i] =
device_idims[i].GetNumber() * device_idims[i].GetChannel() *
device_idims[i].GetHeight() * device_idims[i].GetWidth();
CHECK_EQ(device_idatasizes_[i], origin_idims_[i].production());
device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i]));
}
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[NPU] Output dims[" << i << "]: {"
<< device_odims[i].GetNumber() << ","
<< device_odims[i].GetChannel() << ","
<< device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
<< "}";
device_odatasizes_[i] =
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth();
CHECK_EQ(device_odatasizes_[i], origin_odims_[i].production());
device_otensors_[i].reset(new hiai::AiTensor);
device_otensors_[i]->Init(&(device_odims[i]));
}
return status;
}
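The status |= ... / subgraph::CHECK_FAILED(status) pattern above treats converter return values as combinable bit flags. A sketch of the convention as inferred from this file (the real constants live in lite/kernels/npu/bridges/registry.h and their exact values may differ):

// Assumed encoding, inferred from usage in BuildDeviceProgram():
enum SubgraphStatusSketch {
  SUCCESS = 0,                     // converted, nothing special to report
  FAILED = 1,                      // unsupported op or conversion error
  REBUILD_WHEN_SHAPE_CHANGED = 2,  // device program depends on input shapes
};
inline bool CheckFailedSketch(int status) { return status & FAILED; }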
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors
for (size_t i = 0; i < input_names_.size(); i++) {
std::memcpy(static_cast<float*>(device_itensors_[i]->GetBuffer()),
origin_itensors_[i]->mutable_data<float>(),
sizeof(float) * static_cast<size_t>(device_idatasizes_[i]));
}
// Run the HiAI model by name
std::string key = "model_name";  // Note: the key must be exactly "model_name"
model_context_.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(
device_program_->Process(
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < output_names_.size(); i++) {
std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(device_otensors_[i]->GetBuffer()),
sizeof(float) * static_cast<size_t>(device_odatasizes_[i]));
}
return 0;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kNPU,
kFloat,
kNCHW,
paddle::lite::kernels::npu::SubgraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
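To summarize the control flow the registration above wires up, a sketch of the observable call sequence (not the real subgraph::Engine code): PrepareForRun() constructs the engine and calls Build(), which runs BuildDeviceProgram() to generate the om model at execution time; each Run() calls Launch(), which is expected to rebuild first if REBUILD_WHEN_SHAPE_CHANGED was reported and an input shape changed, then runs LaunchDeviceProgram().

// Pseudo-lifecycle of subgraph::Engine as used by SubgraphCompute (sketch):
class EngineLifecycleSketch {
 public:
  void Build() { BuildDeviceProgram(); }  // once, from PrepareForRun()
  void Launch() {
    // Assumed: if the build reported REBUILD_WHEN_SHAPE_CHANGED and any
    // input dims differ from the cached ones, BuildDeviceProgram() reruns.
    LaunchDeviceProgram();  // copy inputs in, Process(), copy outputs out
  }

 protected:
  virtual int BuildDeviceProgram() = 0;   // HiAI IR graph -> om model
  virtual int LaunchDeviceProgram() = 0;  // run om model via the HiAI client
};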
@@ -14,48 +14,57 @@
 #pragma once

-#include <map>
 #include <memory>
 #include <string>
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
-#include "lite/backends/npu/builder.h"
-#include "lite/core/mir/pass.h"
-#include "lite/core/mir/subgraph/subgraph_program_pass.h"
+#include "ai_ddk_lib/include/HiAiModelManagerService.h"
+#include "lite/core/kernel.h"
+#include "lite/kernels/npu/bridges/engine.h"
 #include "lite/kernels/npu/bridges/registry.h"

 namespace paddle {
 namespace lite {
-namespace mir {
-namespace subgraph {
+namespace kernels {
+namespace npu {

-class GenerateNPUProgramPass : public SubgraphProgramPass {
+class SubgraphEngine : public subgraph::Engine {
  public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+  SubgraphEngine(int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            block_idx, block_desc, input_names, output_names, scope) {}

  protected:
-  // nodes2cvt: op nodes to convert
-  // return cvted_vars: converted var nodes
-  void CvtAllOpNodes(const std::vector<Node*>& nodes2cvt,
-                     lite::kernels::npu::bridges::node_map_type* cvted_vars);
-
-  std::shared_ptr<ge::Operator> CvtVarNode(lite::mir::Node* var_node,
-                                           const Scope* scope);
-
-  std::string BuildNPUGraph(const std::unordered_set<Node*>& op_nodes,
-                            const std::unordered_set<Node*>& in_data_vars,
-                            const std::unordered_set<Node*>& out_data_vars,
-                            int sub_id);
-
-  void GenNPUSubgraph(const std::unique_ptr<SSAGraph>& graph,
-                      const std::unordered_set<Node*>& op_nodes,
-                      int sub_id);
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+  std::string model_name_;
+  hiai::AiContext model_context_;
+  std::vector<int64_t> device_idatasizes_;
+  std::vector<int64_t> device_odatasizes_;
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_;
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_;
+  std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
+};
+
+class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
+
+  void PrepareForRun() override;
+  void Run() override;
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  std::unique_ptr<SubgraphEngine> engine_;
 };

-}  // namespace subgraph
-}  // namespace mir
+}  // namespace npu
+}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
-if(NOT LITE_WITH_XPU)
-  return ()
-endif()
-
-add_kernel(graph_compute_xpu XPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} xpu_runtime)
-# lite_cc_test(test_graph_compute_xpu SRCS graph_compute_test.cc DEPS graph_compute_xpu)
-
 add_subdirectory(bridges)
+
+add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})

-lite_cc_library(xpu_bridge_registry SRCS registry.cc)
+if(NOT LITE_WITH_XPU)
+  return()
+endif()

-set(xpu_bridge_deps xpu_bridge_registry xpu_builder op)
+lite_cc_library(subgraph_bridge_utility_xpu SRCS utility.cc DEPS ${xpu_builder_libs} tensor)
+lite_cc_library(subgraph_bridge_graph_xpu SRCS graph.cc DEPS subgraph_bridge_utility_xpu)

-lite_cc_library(xpu_bridge_act_op SRCS act_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_conv_op SRCS conv_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_elementwise_ops SRCS elementwise_ops.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_pool_op SRCS pool_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_softmax_op SRCS softmax_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_mul_op SRCS mul_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_batch_norm_op SRCS batch_norm_op.cc DEPS ${xpu_bridge_deps})
+set(xpu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_xpu subgraph_bridge_graph_xpu)

-set(xpu_bridges
-    xpu_bridge_registry
-    xpu_bridge_act_op
-    xpu_bridge_conv_op
-    xpu_bridge_elementwise_ops
-    xpu_bridge_pool_op
-    xpu_bridge_softmax_op
-    xpu_bridge_mul_op
-    xpu_bridge_batch_norm_op
-    CACHE INTERNAL "xpu_bridges")
+lite_cc_library(subgraph_bridge_act_op_xpu SRCS act_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_xpu SRCS conv_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_xpu SRCS elementwise_ops.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})

-set(xpu_bridge_test_deps ${xpu_bridges} ${xpu_kernels} ${ops})
-
-lite_cc_test(test_xpu_bridge_act_op SRCS act_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_elementwise_ops SRCS elementwise_ops_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_pool_op SRCS pool_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_softmax_op SRCS softmax_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_mul_op SRCS mul_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_batch_norm_op SRCS batch_norm_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
+set(xpu_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_utility_xpu
+    subgraph_bridge_graph_xpu
+    subgraph_bridge_act_op_xpu
+    subgraph_bridge_conv_op_xpu
+    subgraph_bridge_elementwise_ops_xpu
+    subgraph_bridge_pool_op_xpu
+    subgraph_bridge_softmax_op_xpu
+    subgraph_bridge_mul_op_xpu
+    subgraph_bridge_batch_norm_op_xpu
+    CACHE INTERNAL "xpu_subgraph_bridges")
+
+message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
@@ -12,51 +12,41 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ActConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Create act node and set params from op
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  CHECK(graph->HasNode(x_var_name));
  if (op_type == "relu") {
    graph->AddNode(out_var_name,
                   graph->builder_.CreateRelu(*graph->GetNode(x_var_name)));
  } else {
    // TODO(hong19860320) support more activation ops
    LOG(WARNING) << "[XPU] Unsupported activation type " << op_type;
    return FAILED;
  }
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter);
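For orientation, below is a minimal sketch (not part of this patch) of how a converter registered this way is looked up and invoked; it mirrors the dispatch loop in subgraph_compute.cc later in this diff. ConvertOp is a hypothetical wrapper name.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"

int ConvertOp(paddle::lite::subgraph::xpu::Graph* graph,
              paddle::lite::OpLite* op) {
  const auto& bridges = paddle::lite::subgraph::Registry::Instance();
  std::string op_type = op->op_info()->Type();
  if (!bridges.Exists("XPU", op_type)) {
    return paddle::lite::subgraph::FAILED;  // no converter registered
  }
  // Select() returns the converter function registered by
  // REGISTER_SUBGRAPH_BRIDGE(XPU, <op_type>, <converter>); the Graph is
  // passed through the opaque void* context.
  return bridges.Select("XPU", op_type)(reinterpret_cast<void*>(graph), op);
}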
@@ -12,30 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int BatchNormConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input vars and op attributes
  auto x_var_name = op_info->Input("X").front();
  auto scale_var_name = op_info->Input("Scale").front();
  auto* scale = scope->FindMutableTensor(scale_var_name);
@@ -45,69 +40,33 @@
  auto* mean = scope->FindMutableTensor(mean_var_name);
  auto variance_var_name = op_info->Input("Variance").front();
  auto* variance = scope->FindMutableTensor(variance_var_name);
  auto y_var_name = op_info->Output("Y").front();
  auto epsilon = op_info->GetAttr<float>("epsilon");

  // Create scale, bias, mean, variance nodes
  auto scale_const_node = graph->AddNode(scale_var_name, *scale);
  auto bias_const_node = graph->AddNode(bias_var_name, *bias);
  auto mean_const_node = graph->AddNode(mean_var_name, *mean);
  auto variance_const_node = graph->AddNode(variance_var_name, *variance);

  // Create batch_norm node and set params from op
  auto batch_norm_node =
      graph->builder_.CreateBatchNorm(*graph->GetNode(x_var_name),
                                      *scale_const_node,
                                      *bias_const_node,
                                      *mean_const_node,
                                      *variance_const_node,
                                      1,
                                      epsilon);
  graph->AddNode(y_var_name, graph->builder_.GetField(batch_norm_node, 0));
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         batch_norm,
                         paddle::lite::subgraph::xpu::BatchNormConverter);
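For reference, here is a small standalone sketch of the inference-time semantics the batch_norm node above encodes over NCHW data (axis 1 is the channel dimension): y = scale * (x - mean) / sqrt(variance + epsilon) + bias. BatchNormNCHW is a hypothetical helper, independent of XTCL.

#include <cmath>
#include <cstddef>
#include <vector>

void BatchNormNCHW(const std::vector<float>& x, std::vector<float>* y,
                   const std::vector<float>& scale,
                   const std::vector<float>& bias,
                   const std::vector<float>& mean,
                   const std::vector<float>& variance, float epsilon,
                   size_t n, size_t c, size_t hw) {
  y->resize(x.size());
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < c; j++) {
      // Fold the four per-channel params into a single affine transform.
      float alpha = scale[j] / std::sqrt(variance[j] + epsilon);
      float beta = bias[j] - alpha * mean[j];
      for (size_t k = 0; k < hw; k++) {
        size_t idx = (i * c + j) * hw + k;
        (*y)[idx] = alpha * x[idx] + beta;
      }
    }
  }
}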
@@ -13,31 +13,32 @@
// limitations under the License.

#include "lite/operators/conv_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ConvConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " << op_type << "... ";

  // Get input, filter and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_dims = input->dims();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_dims = filter->dims();
  auto output_var_name = op_info->Output("Output").front();
  auto bs = input_dims[0];
  auto oc = filter_dims[0];
  CHECK_EQ(input_dims.size(), 4);
@@ -80,26 +81,14 @@
  }
  DDim output_dims(output_shape);

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);

  // Create conv node and set input, filter, bias nodes and attributes
  auto conv_attrs = xtcl::make_node<xtcl::network::Conv2DAttrs>();
  conv_attrs->strides = std::move(CvtShape(strides));
  conv_attrs->padding = std::move(CvtShape(paddings));
  conv_attrs->dilation = std::move(CvtShape(dilations));
  conv_attrs->groups = groups;
  // conv_attrs->channels = nullptr;
  conv_attrs->kernel_size = std::move(xtcl::Array<xtcl::xIndexExpr>(nullptr));
@@ -107,20 +96,19 @@
  conv_attrs->kernel_layout = "OIHW";
  conv_attrs->out_layout = "";
  // conv_attrs->out_dtype = "";
  auto conv_node = graph->AddNode(
      output_var_name,
      graph->builder_.CreateConv2D(
          *graph->GetNode(input_var_name), *filter_const_node, conv_attrs));

  // Create a bias node if the op has a bias input; the following bias
  // dimensions are supported:
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto bias_dims = bias->dims();
    auto bias_data_size = bias_dims.production();
    auto output_data_size = output_dims.production();
@@ -137,57 +125,46 @@
      // 2: {n, oc, oh, ow}
      bias_shape = output_dims.Vectorize();
    } else {
      LOG(ERROR) << "[XPU] Bias dimension " << bias_dims
                 << " isn't supported in conv2d Op when output dimension is "
                 << output_dims;
    }
    std::shared_ptr<xtcl::xExpr> bias_node = nullptr;
    if (graph->HasNode(bias_var_name)) {
      // Bias node from input node
      bias_node = graph->GetNode(bias_var_name);
    } else {
      // Bias node with const tensor
      bias_node = graph->AddNode(bias_var_name, *bias, bias_shape);
    }
    std::shared_ptr<xtcl::xExpr> add_node = nullptr;
    if (is_channel_bias) {
      add_node = graph->AddNode(
          output_var_name,
          graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node));
    } else {
      add_node = graph->AddNode(
          output_var_name,
          graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node));
    }
    conv_node = add_node;
  }
  if (fuse_relu) {
    // Append a relu node if fuse_relu is true
    graph->AddNode(output_var_name, graph->builder_.CreateRelu(*conv_node));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         conv2d,
                         paddle::lite::subgraph::xpu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
                         depthwise_conv2d,
                         paddle::lite::subgraph::xpu::ConvConverter);
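A sketch of the bias-shape classification the converter performs: a 1-D bias of size {oc} maps to CreateBiasAdd along the channel axis, while {1, oc, oh, ow} and {n, oc, oh, ow} biases fall back to an elementwise "add". ClassifyConvBias is a hypothetical helper that restates the size comparisons above.

#include <cstdint>

enum class BiasKind { kChannel, kPerImage, kPerBatch, kUnsupported };

BiasKind ClassifyConvBias(int64_t bias_size, int64_t oc,
                          int64_t output_size, int64_t bs) {
  if (bias_size == oc) return BiasKind::kChannel;                 // {oc}
  if (bias_size == output_size / bs) return BiasKind::kPerImage;  // {1, oc, oh, ow}
  if (bias_size == output_size) return BiasKind::kPerBatch;       // {n, oc, oh, ow}
  return BiasKind::kUnsupported;
}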
@@ -12,85 +12,72 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ElementwiseConverter(void* ctx, OpLite* op) {
  CHECK(op != nullptr);
  CHECK(ctx != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto x = scope->FindMutableTensor(x_var_name);
  auto y = scope->FindMutableTensor(y_var_name);
  auto x_dims = x->dims();
  auto y_dims = y->dims();

  // Create x and y node
  std::shared_ptr<xtcl::xExpr> x_node = nullptr;
  if (graph->HasNode(x_var_name)) {
    x_node = graph->GetNode(x_var_name);
  } else {
    x_node = graph->AddNode(x_var_name, *x);
  }
  std::shared_ptr<xtcl::xExpr> y_node = nullptr;
  if (graph->HasNode(y_var_name)) {
    y_node = graph->GetNode(y_var_name);
  } else {
    y_node = graph->AddNode(y_var_name, *y);
  }

  // Create elementwise node and set input, attributes
  std::shared_ptr<xtcl::xExpr> elementwise_node = nullptr;
  if (y_dims.size() == 1) {
    elementwise_node = graph->AddNode(
        out_var_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node));
  } else if (x_dims.size() == y_dims.size()) {
    elementwise_node = graph->AddNode(
        out_var_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node));
  } else {
    LOG(WARNING)
        << "[XPU] elementwise_add only supports y of one dimension, or x "
           "and y of the same dimension, but received x's dimension: "
        << x_dims << ", y's dimension: " << y_dims << ", axis: " << axis;
    return FAILED;
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         elementwise_add,
                         paddle::lite::subgraph::xpu::ElementwiseConverter);
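For reference, a standalone sketch of the broadcasting used in the first branch above: a 1-D y of length x_dims[axis] is added along `axis` of x. BiasAdd here is a hypothetical reference implementation, not XTCL's.

#include <cstdint>
#include <vector>

void BiasAdd(const std::vector<float>& x, const std::vector<float>& y,
             std::vector<float>* out, const std::vector<int64_t>& x_dims,
             int axis) {
  // Split x's shape into outer / channel / inner extents around `axis`.
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; i++) outer *= x_dims[i];
  for (size_t i = axis + 1; i < x_dims.size(); i++) inner *= x_dims[i];
  int64_t channels = x_dims[axis];
  out->resize(x.size());
  for (int64_t o = 0; o < outer; o++)
    for (int64_t c = 0; c < channels; c++)
      for (int64_t i = 0; i < inner; i++) {
        int64_t idx = (o * channels + c) * inner + i;
        (*out)[idx] = x[idx] + y[c];  // y broadcast along `axis`
      }
}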
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/bridges/graph.h"
#include <utility>
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const xtcl::xExpr& layer) {
auto unique_name = [&](const std::string& key) {
int idx = 1;
auto it = counts_.find(key);
if (it == counts_.end()) {
counts_.insert(std::make_pair(key, idx));
} else {
idx = ++(it->second);
}
return key + "_" + std::to_string(idx);
};
auto it = nodes_.find(name);
if (it != nodes_.end()) {
CHECK(params_.find(name) == params_.end()) << "[XPU] Node " << name
<< " redefined.";
// If the origin node isn't a const node, rebind it to a generated unique
// key (new_name -> origin node) so that it remains reachable:
nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second));
nodes_.erase(it);
}
// Create a new node and bind with the name: name->new_node
auto node = std::make_shared<xtcl::xExpr>(layer);
nodes_.insert(std::make_pair(name, node));
builder_.SetLayer(unique_name(name + "_op"));
return node;
}
// Const node
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const Tensor& tensor,
PrecisionType ptype,
DataLayoutType ltype) {
return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype);
}
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
auto node = AddNode(name, shape, ptype, ltype);
params_.emplace(
std::make_pair(name, *CvtTensor(tensor, shape, ptype, ltype)));
return node;
}
// Data node
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name));
auto node = std::make_shared<xtcl::xExpr>(
builder_.CreateTensor(name, CvtShape(shape), CvtPrecisionType(ptype)));
nodes_.insert(std::make_pair(name, node));
return node;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
// The Context of the converters which used for converting the ops of subgraph
// to the XPU IR graph
class Graph {
public:
// Layer node
std::shared_ptr<xtcl::xExpr> AddNode(const std::string& name,
const xtcl::xExpr& layer);
// Const node
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const Tensor& tensor,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
template <typename T>
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const std::vector<T>& data,
std::vector<int64_t> shape = {},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
const std::type_info& info = typeid(T);
PrecisionType ptype = PRECISION(kFloat);
if (info == typeid(float)) {
ptype = PRECISION(kFloat);
} else if (info == typeid(int8_t)) {
ptype = PRECISION(kInt8);
} else if (info == typeid(int32_t)) {
ptype = PRECISION(kInt32);
} else {
LOG(FATAL) << "[XPU] Unknow data type " << info.name();
}
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
} else {
int size = 1;
for (auto i : shape) {
size *= i;
}
CHECK_EQ(data.size(), size);
}
Tensor tensor;
tensor.Resize(shape);
std::memcpy(reinterpret_cast<uint8_t*>(tensor.mutable_data<T>()),
reinterpret_cast<const uint8_t*>(data.data()),
data.size() * sizeof(T));
return AddNode(name, tensor, ptype, ltype);
}
template <typename T>
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
T value,
std::vector<int64_t> shape = {1},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
int64_t size = 1;
for (auto i : shape) {
size *= i;
}
std::vector<T> data(size, value);
return AddNode(name, data, shape, ltype);
}
// Data node
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<xtcl::xExpr> GetNode(const std::string& name) {
CHECK(HasNode(name)) << "[XPU] Node " << name << " not found.";
return nodes_.at(name);
}
bool HasNode(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
public:
// XPU network builder and constant tensors
xtcl::network::xNetworkBuilder builder_;
xtcl::network::xTensorCompiler::ParamNDArrayMap params_;
private:
std::unordered_map<std::string, std::shared_ptr<xtcl::xExpr>> nodes_;
std::unordered_map<std::string, int> counts_;
};
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
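A usage sketch (assumed, based on the Graph class above) of the node-binding rules: each var name resolves to its most recent node, and rebinding a non-const name keeps the old node reachable under a generated "<name>_var_<idx>" key, so in-place ops can safely reuse their input var name as output. BuildToyGraph is a hypothetical function.

#include <vector>
#include "lite/kernels/xpu/bridges/graph.h"

void BuildToyGraph(paddle::lite::subgraph::xpu::Graph* graph) {
  // Data (input) node: binds "x" to a float32 NCHW placeholder.
  auto x = graph->AddNode("x", std::vector<int64_t>{1, 3, 224, 224});
  // Rebinding "x" to a layer node: the old node is kept under a generated
  // "x_var_<idx>" key, and "x" now resolves to the relu output.
  graph->AddNode("x", graph->builder_.CreateRelu(*x));
  auto relu_out = graph->GetNode("x");
  (void)relu_out;  // would feed the next layer
}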
@@ -12,34 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int MulConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto y = scope->FindMutableTensor(y_var_name);
  auto y_dims = y->dims();
  CHECK_EQ(y_dims.size(), 2) << "XPU now only supports y_dims.size() == 2";
  auto x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
@@ -47,54 +43,38 @@
  auto y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
  CHECK_EQ(y_num_col_dims, 1) << "XPU now only supports y_num_col_dims == 1";

  // Flatten x node
  auto x_node = graph->AddNode(
      x_var_name + "/flatten",
      graph->builder_.CreateBatchFlatten(*graph->GetNode(x_var_name)));

  // Transpose y data and create y node
  Tensor transpose_y;
  DDim transpose_y_dims(std::vector<int64_t>{y_dims[1], y_dims[0]});
  transpose_y.Resize(transpose_y_dims);
  auto transpose_y_data = transpose_y.mutable_data<float>();
  auto y_data = y->mutable_data<float>();
  for (int i = 0; i < transpose_y_dims[0]; i++) {
    for (int j = 0; j < transpose_y_dims[1]; j++) {
      transpose_y_data[i * transpose_y_dims[1] + j] =
          y_data[j * transpose_y_dims[0] + i];
    }
  }
  auto y_const_node = graph->AddNode(y_var_name + "/transpose", transpose_y);

  // Create mul node and set params from op
  graph->AddNode(
      out_var_name,
      graph->builder_.CreateDense(*x_node,
                                  static_cast<int>(y_dims[1]),
                                  ::xtcl::NullValue<::xtcl::DataType>(),
                                  *y_const_node));
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU, mul, paddle::lite::subgraph::xpu::MulConverter);
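A standalone sketch of the row-major 2-D transpose above. The assumption, as the code suggests, is that CreateDense takes its weight in {units, in_dim} layout with units = y_dims[1], so y ({K, N}) must be stored as y^T ({N, K}). Transpose2D is a hypothetical helper.

#include <cstdint>
#include <vector>

std::vector<float> Transpose2D(const std::vector<float>& src, int64_t rows,
                               int64_t cols) {
  // dst(c, r) = src(r, c) for a rows x cols row-major matrix.
  std::vector<float> dst(src.size());
  for (int64_t r = 0; r < rows; r++)
    for (int64_t c = 0; c < cols; c++)
      dst[c * rows + r] = src[r * cols + c];
  return dst;
}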
@@ -14,13 +14,11 @@
#pragma once

USE_SUBGRAPH_BRIDGE(XPU, relu);
USE_SUBGRAPH_BRIDGE(XPU, conv2d);
USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d);
USE_SUBGRAPH_BRIDGE(XPU, elementwise_add);
USE_SUBGRAPH_BRIDGE(XPU, pool2d);
USE_SUBGRAPH_BRIDGE(XPU, softmax);
USE_SUBGRAPH_BRIDGE(XPU, mul);
USE_SUBGRAPH_BRIDGE(XPU, batch_norm);
@@ -12,30 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int PoolConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
  auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
@@ -44,54 +40,51 @@
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto exclusive = op_info->GetAttr<bool>("exclusive");

  // Create pool node and set params from op
  if (pooling_type == "max") {
    if (global_pooling) {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateGlobalMaxPool2D(*graph->GetNode(x_var_name)));
    } else {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateMaxPool2D(*graph->GetNode(x_var_name),
                                          CvtShape(ksize),
                                          CvtShape(strides),
                                          CvtShape(paddings),
                                          "NCHW",
                                          ceil_mode));
    }
  } else if (pooling_type == "avg") {
    if (global_pooling) {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateGlobalAvgPool2D(*graph->GetNode(x_var_name)));
    } else {
      // !exclusive ---> count_include_pad
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateAvgPool2D(*graph->GetNode(x_var_name),
                                          CvtShape(ksize),
                                          CvtShape(strides),
                                          CvtShape(paddings),
                                          "NCHW",
                                          ceil_mode,
                                          !exclusive));
    }
  } else {
    LOG(WARNING) << "[XPU] Unsupported pooling type: " << pooling_type;
    return FAILED;
  }
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         pool2d,
                         paddle::lite::subgraph::xpu::PoolConverter);
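A one-liner restating the flag flip in the "!exclusive ---> count_include_pad" comment above: Paddle's `exclusive` excludes padded cells from the averaging divisor, whereas XTCL's count_include_pad includes them, so the converter passes the negation. AvgPoolDivisor is a hypothetical helper.

// valid_cells: window cells that overlap the (unpadded) input.
inline int AvgPoolDivisor(int window_cells, int valid_cells,
                          bool count_include_pad) {
  return count_include_pad ? window_cells : valid_cells;
}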
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
// xpu network builder and constant tensors
class graph_ctx_type {
public:
std::shared_ptr<xtcl::network::xNetworkBuilder> builder;
std::shared_ptr<xtcl::network::xTensorCompiler::ParamNDArrayMap> params;
};
// var_name, xpu node pointer
using node_map_type =
std::unordered_map<std::string, std::shared_ptr<xtcl::xExpr>>;
using func_type = std::function<node_map_type(
const std::shared_ptr<OpLite>, graph_ctx_type*, const node_map_type&)>;
using cvt_map_type = std::unordered_map<std::string, func_type>;
class Factory {
public:
static Factory& Instance();
const cvt_map_type& AllFunctions() const { return map_; }
bool HasType(const std::string& op_type) const;
void Insert(const std::string& op_type, const func_type& func_name);
Factory() = default;
private:
cvt_map_type map_;
DISALLOW_COPY_AND_ASSIGN(Factory);
};
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Some platform-independent definitions
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_xpu_bridge_##op_type##__, \
"REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \
int __reg_xpu_bridge_##op_type##_Insert() { \
paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \
#op_type, cvt_func_name); \
return 0; \
}
#define USE_XPU_BRIDGE(op_type) \
extern int __reg_xpu_bridge_##op_type##_Insert(); \
static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \
__reg_xpu_bridge_##op_type##_Insert();
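For reference, here is roughly what one registration expands to under these (now superseded) macros; the static_assert guard is omitted for brevity, and ActConverter stands in for any converter. REGISTER_XPU_BRIDGE emits a global insertion function, and USE_XPU_BRIDGE forces it to run during static initialization so the linker keeps the bridge's object file.

// Approximate expansion of REGISTER_XPU_BRIDGE(relu, ActConverter):
int __reg_xpu_bridge_relu_Insert() {
  paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert(
      "relu", ActConverter);
  return 0;
}

// Approximate expansion of USE_XPU_BRIDGE(relu) in a consumer file:
extern int __reg_xpu_bridge_relu_Insert();
static int __reg_xpu_bridge_relu_Insert_return UNUSED =
    __reg_xpu_bridge_relu_Insert();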
@@ -12,50 +12,40 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int SoftmaxConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get op's attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");

  // Create softmax node and set params from op
  graph->AddNode(
      out_var_name,
      graph->builder_.CreateSoftmax(*graph->GetNode(x_var_name), axis));
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         softmax,
                         paddle::lite::subgraph::xpu::SoftmaxConverter);
@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/bridges/utility.h"
#include <utility>

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

bool HasInputArg(const OpInfo* op_info,
@@ -39,20 +38,6 @@
  }
}

xtcl::DataType CvtPrecisionType(PrecisionType in_type) {
  xtcl::DataType out_type = ::xtcl::Float(32);
  switch (in_type) {
@@ -66,8 +51,8 @@
      out_type = ::xtcl::Int(32);
      break;
    default:
      LOG(FATAL) << "[XPU] Can not convert precision type("
                 << PrecisionToStr(in_type) << ") from Lite to XPU";
      break;
  }
  return out_type;
@@ -86,8 +71,8 @@
      out_type = {kDLInt, 32, 1};
      break;
    default:
      LOG(FATAL) << "[XPU] Can not convert data type("
                 << PrecisionToStr(in_type) << ") from Lite to XPU";
      break;
  }
  return out_type;
@@ -109,28 +94,28 @@
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims) {
  return CvtShape(in_dims.Vectorize());
}

std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor,
                                          std::vector<int64_t> out_shape,
                                          PrecisionType in_ptype,
                                          DataLayoutType in_ltype) {
  const uint8_t* in_data = nullptr;
  auto in_size = in_tensor.dims().production();
  auto in_shape = in_tensor.dims().Vectorize();
  if (out_shape.empty()) {
    out_shape = in_shape;
  }
  int in_bytes;
  if (in_ptype == PRECISION(kFloat)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<float>());
    in_bytes = in_size * sizeof(float);
  } else if (in_ptype == PRECISION(kInt32)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int32_t>());
    in_bytes = in_size * sizeof(int32_t);
  } else if (in_ptype == PRECISION(kInt8)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int8_t>());
    in_bytes = in_size * sizeof(int8_t);
  } else {
    LOG(FATAL) << "[XPU] Unknown precision type " << PrecisionToStr(in_ptype);
  }
  auto out_tensor = std::make_shared<xtcl::xNDArray>(
      xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0}));
@@ -140,50 +125,7 @@
  return out_tensor;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
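A sketch (assumed) of what the const-tensor overload of Graph::AddNode does with CvtTensor: the converted NDArray is stored in the graph's param map under the var name, alongside the placeholder node. RegisterWeight is a hypothetical helper.

#include <string>
#include <utility>
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

void RegisterWeight(paddle::lite::subgraph::xpu::Graph* graph,
                    const std::string& name,
                    const paddle::lite::Tensor& weight) {
  // Defaults assume float32 data in NCHW layout; the tensor keeps its shape.
  auto ndarray = paddle::lite::subgraph::xpu::CvtTensor(weight);
  graph->params_.emplace(std::make_pair(name, *ndarray));
}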
@@ -17,22 +17,20 @@
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

bool HasInputArg(const OpInfo* op_info,
                 const Scope* scope,
                 const std::string& argname);

xtcl::DataType CvtPrecisionType(PrecisionType in_type);

DLDataType CvtDataType(PrecisionType in_type);
@@ -44,17 +42,12 @@
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims);

std::shared_ptr<xtcl::xNDArray> CvtTensor(
    const Tensor& in_tensor,
    std::vector<int64_t> out_shape = {},
    PrecisionType in_ptype = PRECISION(kFloat),
    DataLayoutType in_ltype = DATALAYOUT(kNCHW));

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/graph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <string>
#include <vector>
#include "lite/backends/xpu/runtime.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void GraphCompute::PrepareForRun() {
// auto& ctx = this->ctx_->template As<XPUContext>();
auto& param = this->Param<param_t>();
CHECK(param.weight);
CHECK(lite::xpu::LoadModel(*param.weight, &runtime_));
CHECK(runtime_ != nullptr);
}
void GraphCompute::Run() {
auto& param = this->Param<param_t>();
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
for (int i = 0; i < param.inputs.size(); i++) {
auto input_var_name = param.inputs[i].first;
auto input_tensor = param.inputs[i].second;
LOG(INFO) << "input dims[" << i << ":" << input_var_name
<< "]: " << input_tensor->dims();
auto input_tensor_data = input_tensor->data<float>();
for (int j = 0; j < input_tensor->dims().production(); j++) {
VLOG(3) << input_tensor_data[j];
}
auto input_ndarray = xtcl::xNDArray::Empty(
input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0});
auto input_ndarray_data =
static_cast<float*>(input_ndarray.ToDLPack()->dl_tensor.data);
std::memcpy(input_ndarray_data,
input_tensor_data,
sizeof(float) * input_tensor->dims().production());
runtime_->SetInputZeroCopy(input_var_name,
&input_ndarray.ToDLPack()->dl_tensor);
}
runtime_->Run();
for (int i = 0; i < param.outputs.size(); i++) {
auto output_ndarray = runtime_->GetOutput(i);
auto output_var_name = param.outputs[i].first;
auto output_tensor = param.outputs[i].second;
output_tensor->Resize(output_ndarray.Shape());
LOG(INFO) << "output dims[" << i << ":" << output_var_name
<< "]: " << output_tensor->dims();
auto output_ndarray_data =
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data);
auto output_tensor_data = output_tensor->mutable_data<float>();
std::memcpy(output_tensor_data,
output_ndarray_data,
sizeof(float) * output_tensor->dims().production());
for (int j = 0; j < output_tensor->dims().production(); j++) {
VLOG(3) << output_tensor_data[j];
}
}
LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(graph_op,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::GraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/xpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all input data variables and add them to the XPU IR graph
subgraph::xpu::Graph graph;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph.AddNode(input_name, input_tensor->dims().Vectorize());
CHECK(input_node);
// XTCL doesn't support dynamic dimensions/shapes, so the program needs to
// be rebuilt when the shape of any input tensor changes.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
// Convert all ops and their weights and add them to the XPU IR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists("XPU", op_type)) {
return subgraph::FAILED;
}
status |= bridges.Select("XPU", op_type)(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Obtain the output nodes of the XPU IR graph and build the graph into the
// XPU runtime
std::vector<xtcl::xExpr*> output_nodes;
for (auto& output_name : output_names_) {
output_nodes.push_back(graph.GetNode(output_name).get());
}
device_program_ = lite::xpu::Device::Global().Build(
&graph.builder_, &graph.params_, &output_nodes);
if (device_program_ == nullptr) {
LOG(WARNING) << "[XPU] Build model failed!";
return subgraph::FAILED;
}
// Query and check the dimensions of input and output tensors
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i];
}
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i];
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of the origin input tensors to the buffers of the XPU input
// tensors
for (size_t i = 0; i < input_names_.size(); i++) {
auto input_ndarray =
xtcl::xNDArray::Empty(origin_itensors_[i]->dims().Vectorize(),
{kDLFloat, 32, 1},
{kDLCPU, 0});
std::memcpy(static_cast<float*>(input_ndarray.ToDLPack()->dl_tensor.data),
origin_itensors_[i]->mutable_data<float>(),
sizeof(float) * origin_itensors_[i]->dims().production());
device_program_->SetInputZeroCopy(input_names_[i],
&input_ndarray.ToDLPack()->dl_tensor);
}
// Run the XPU model
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
device_program_->Run();
VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of the XPU output tensors to the buffers of the origin output
// tensors
for (size_t i = 0; i < output_names_.size(); i++) {
auto output_ndarray = device_program_->GetOutput(i);
std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data),
sizeof(float) * origin_otensors_[i]->dims().production());
}
return 0;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SubgraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
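With the subgraph op and the kXPU kernel registered above, a partitioned model runs through the regular full API. A minimal usage sketch, assuming the standard Paddle-Lite CxxConfig/CreatePaddlePredictor API and a hypothetical model directory:
#include "lite/api/paddle_api.h"
int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model path
  // Listing kXPU first lets the subgraph pass offload supported ops to the
  // XPU; unsupported ops fall back to the host/X86 kernels.
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; i++) data[i] = 1.f;
  predictor->Run();
  auto output = predictor->GetOutput(0);
  return 0;
}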
@@ -14,41 +14,51 @@
 #pragma once
 
+#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
 #include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/types.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace npu {
+namespace xpu {
 
-class GraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
+class SubgraphEngine : public subgraph::Engine {
  public:
-  using param_t = operators::GraphParam;
+  SubgraphEngine(int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            block_idx, block_desc, input_names, output_names, scope) {}
+
+ protected:
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+  std::unique_ptr<xtcl::network::xRuntimeInstance> device_program_{nullptr};
+};
+
+class SubgraphCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
 
   void PrepareForRun() override;
 
   void Run() override;
 
-  virtual ~GraphCompute() = default;
+  virtual ~SubgraphCompute() = default;
 
  private:
-  std::shared_ptr<hiai::AiModelMngerClient> model_client_;
-  std::string model_name_;
-  hiai::AiContext model_context_;
-  std::vector<int64_t> npu_idatasizes_;
-  std::vector<int64_t> npu_odatasizes_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> npu_itensors_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> npu_otensors_;
+  std::unique_ptr<SubgraphEngine> engine_;
 };
 
-}  // namespace npu
+}  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
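SubgraphEngine above only supplies the two device hooks; the rebuild promised by REBUILD_WHEN_SHAPE_CHANGED is driven from the base subgraph::Engine. A minimal sketch of that contract, assuming (not quoting) the base-class logic:
// Assumed shape of subgraph::Engine::Launch(): remember the input dims seen
// at build time and rebuild the device program when any of them changes.
int Engine::Launch() {
  bool shape_changed = false;
  for (size_t i = 0; i < origin_itensors_.size(); i++) {
    if (!(origin_itensors_[i]->dims() == origin_idims_[i])) {
      shape_changed = true;
    }
  }
  if (shape_changed) {
    BuildDeviceProgram();  // regenerate the XTCL graph for the new shapes
  }
  return LaunchDeviceProgram();
}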
@@ -48,7 +48,7 @@ add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_
 add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
 add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
 add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
-add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS})
+add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
 
 # 2.basic ops not used in basic models
 add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
@@ -106,7 +106,6 @@ add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
 add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS})
 add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS})
 add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
-add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS})
 add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS})
 add_operator(logical_and extra SRCS logical_op.cc DEPS ${op_DEPS})
 add_operator(logical_or extra SRCS logical_op.cc DEPS ${op_DEPS})
...
@@ -70,10 +70,14 @@ struct CalibParam {
   float scale;
 };
 
-struct GraphParam {
-  std::vector<std::pair<std::string, const lite::Tensor*>> inputs{};
-  lite::Tensor* weight{};
-  std::vector<std::pair<std::string, lite::Tensor*>> outputs{};
+struct SubgraphParam {
+  std::vector<std::string> input_names{};
+  std::vector<std::string> output_names{};
+  std::vector<std::string> input_data_names{};
+  std::vector<std::string> output_data_names{};
+  int sub_block_idx{-1};
+  cpp::BlockDesc* sub_block_desc{nullptr};
+  Scope* scope{nullptr};
};
 
 /// -------------------------- NN operators ------------------------------------
...
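The renamed SubgraphParam no longer carries tensor pairs; it is filled from attributes of the op desc that the subgraph pass writes. A hedged sketch of the desc such a pass would emit (assumed cpp::OpDesc setters; the variable names are illustrative), which AttachImpl in the next file reads back key by key:
// Illustrative desc for one fused subgraph; "x"/"y" are placeholder names.
cpp::OpDesc op_desc;
op_desc.SetType("subgraph");
op_desc.SetInput("Inputs", {"x"});
op_desc.SetOutput("Outputs", {"y"});
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {"x"});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {"y"});
op_desc.SetAttr<int32_t>("sub_block", 1);  // index of the sub-block desc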
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/operators/graph_op.h"
+#include "lite/operators/subgraph_op.h"
 #include <utility>
 #include "lite/core/op_registry.h"
 
@@ -20,34 +20,29 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-bool GraphOpLite::CheckShape() const {
-  CHECK_GE_OR_FALSE(param_.inputs.size(), 1UL);
-  CHECK_GE_OR_FALSE(param_.outputs.size(), 1UL);
-  return true;
-}
+bool SubgraphOp::CheckShape() const { return true; }
 
-bool GraphOpLite::InferShape() const { return CheckShape(); /* enrich me */ }
+bool SubgraphOp::InferShape() const { return CheckShape(); /* enrich me */ }
 
-bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
-  auto inputs = op_desc.Input("Inputs");
-  auto weight = op_desc.Input("Weight");
-  auto outputs = op_desc.Output("Outputs");
-
-  for (auto var : inputs) {
-    CHECK(scope->FindVar(var));
-    param_.inputs.push_back(
-        std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
-  }
-
-  param_.weight = scope->FindVar(weight.front())->GetMutable<lite::Tensor>();
-  CHECK(param_.weight);
-
-  for (auto var : outputs) {
-    CHECK(scope->FindVar(var));
-    param_.outputs.push_back(
-        std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
-  }
+bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  param_.input_names = op_desc.Input("Inputs");
+  param_.output_names = op_desc.Output("Outputs");
+  for (auto& input_name : param_.input_names) {
+    CHECK(scope->FindVar(input_name));
+    scope->FindVar(input_name)->GetMutable<lite::Tensor>();
+  }
+  for (auto& output_name : param_.output_names) {
+    CHECK(scope->FindVar(output_name));
+    scope->FindVar(output_name)->GetMutable<lite::Tensor>();
+  }
+  param_.input_data_names =
+      op_desc.GetAttr<std::vector<std::string>>("input_data_names");
+  param_.output_data_names =
+      op_desc.GetAttr<std::vector<std::string>>("output_data_names");
+  CHECK(param_.sub_block_desc);
+  param_.sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
+  param_.scope = scope;
+  CHECK(param_.scope);
 
   return true;
 }
@@ -55,4 +50,4 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_OP(graph_op, paddle::lite::operators::GraphOpLite);
+REGISTER_LITE_OP(subgraph, paddle::lite::operators::SubgraphOp);
@@ -27,11 +27,11 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-class GraphOpLite : public OpLite {
+class SubgraphOp : public OpLite {
  public:
-  GraphOpLite() {}
+  SubgraphOp() {}
 
-  explicit GraphOpLite(const std::string &type) : OpLite(type) {}
+  explicit SubgraphOp(const std::string &type) : OpLite(type) {}
 
   bool CheckShape() const override;
@@ -41,10 +41,13 @@ class GraphOpLite : public OpLite {
   void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
 
-  std::string DebugString() const override { return "graph_op"; }
+  std::string DebugString() const override { return "subgraph"; }
+
+  void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; }
+  cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; }
 
  private:
-  mutable GraphParam param_;
+  mutable SubgraphParam param_;
 };
 
 }  // namespace operators
...
@@ -8,7 +8,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
     lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -44,16 +44,16 @@ if(LITE_BUILD_EXTRA)
     lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   endif()
-  lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
@@ -243,38 +243,53 @@ class ActivationComputeTester : public arena::TestCase {
 TEST(Activation_relu, precision) {
   LOG(INFO) << "test relu op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#elif defined(LITE_WITH_XPU)
+  place = TARGET(kXPU);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
       for (auto h : {9, 18}) {
         for (auto w : {9, 18}) {
-          for (auto slope : {0.01, 0.1}) {
-            std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
-                place,
-                "def",
-                0.01,
-                6.,
-                "all",
-                0.,
-                DDim(std::vector<int64_t>({n, c, h, w})),
-                "relu",
-                RELU));
-            arena::Arena arena(std::move(tester), place, 2e-5);
-            arena.TestPrecision();
-          }
+          std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
+              place,
+              "def",
+              0.01,
+              6.,
+              "all",
+              0.,
+              DDim(std::vector<int64_t>({n, c, h, w})),
+              "relu",
+              RELU));
+          arena::Arena arena(std::move(tester), place, abs_error);
+          arena.TestPrecision();
         }
       }
     }
   }
-#endif
 }
 
 TEST(Activation_leaky_relu, precision) {
   LOG(INFO) << "test leaky_relu op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -291,20 +306,27 @@ TEST(Activation_leaky_relu, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "leaky_relu",
               LEAKY_RELU));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
     }
   }
 }
-#endif
 }
 
 TEST(Activation_relu_clipped, precision) {
   LOG(INFO) << "test relu clipped op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -321,14 +343,13 @@ TEST(Activation_relu_clipped, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "relu_clipped",
               RELU_CLIPPED));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
    }
  }
 }
-#endif
 }
 
 TEST(Activation_prelu, precision) {
@@ -363,8 +384,16 @@ TEST(Activation_prelu, precision) {
 TEST(Activation_sigmoid, precision) {
   LOG(INFO) << "test sigmoid op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -380,19 +409,26 @@ TEST(Activation_sigmoid, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "sigmoid",
               SIGMOID));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
      }
    }
  }
-#endif
 }
 
 TEST(Activation_tanh, precision) {
   LOG(INFO) << "test tanh op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -408,13 +444,12 @@ TEST(Activation_tanh, precision) {
              DDim(std::vector<int64_t>({n, c, h, w})),
              "tanh",
              TANH));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
          arena.TestPrecision();
        }
      }
    }
  }
-#endif
 }
 
 TEST(Activation_swish, precision) {
...
@@ -7,7 +7,7 @@ ARM_ABI="armv8" # armv8, armv7
 ARM_LANG="gcc" # gcc only yet
 ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310
 DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/
-TARGET_NAME="test_npu_pass" # default target
+TARGET_NAME="test_subgraph_pass" # default target
 BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
 WITH_JAVA=ON # ON(build jar and jni so)/OFF
 WITH_TESTING=ON # ON/OFF
...
@@ -3,7 +3,7 @@ set -ex
 
 # global variables with default value
 XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
-TARGET_NAME="lite_compile_deps" # default target
+TARGET_NAME="test_subgraph_pass" # default target
 BUILD_EXTRA=ON # ON(with sequence ops)/OFF
 WITH_TESTING=ON # ON/OFF
@@ -73,8 +73,8 @@ function build_xpu {
         -DWITH_MKLDNN=OFF \
         -DLITE_WITH_X86=ON \
         -DWITH_MKL=ON \
-        -DLITE_BUILD_EXTRA=ON \
         -DLITE_WITH_XPU=ON \
+        -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
         -DWITH_TESTING=${WITH_TESTING} \
         -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
...