Unverified commit c56bf0d8 authored by hong19860320 and committed by GitHub

[Cherry-pick][Core] Add the graph optimization of subblocks for transformer model (#3947) (#3979)

* [Cherry-pick][Core] Add the graph optimization of subblocks for transformer model (#3947)
test=develop
* [Core][ARM] Fix beam_search, eltwise_mul supports broadcast and the int64_t data type, add print op and kernel, add exception support
test=develop

* Fix the dims of parent_idx in the ARM kernel of the beam_search op

* elementwise_mul supports int64_t data type with broadcasting

* Add print op and kernel for debugging

* Support throwing an exception when an internal error occurs

* Refine the while and conditional_block op kernels

* Support the graph optimization on subblocks

* Pass program_desc and block_idx into the kernels of the control flow ops (while/conditional_block/subgraph) and create the RuntimeProgram online, which makes it possible to call the control flow ops recursively (a minimal sketch follows below)

* Add a unit test for the masked transformer model
Parent 37a01383
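The following is a minimal, hypothetical C++ sketch of the control-flow change described in the commit message. The types below (ProgramDesc, RuntimeProgram, WhileKernel) are simplified stand-ins, not the real Paddle-Lite classes or their signatures; the point is only to show how a control-flow kernel that receives a shared program descriptor plus a block index, and builds its RuntimeProgram on demand, can nest recursively.

// Hypothetical stand-in types for illustration only; not the Paddle-Lite API.
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct ProgramDesc {
  // Each block is modeled as a list of op names; real blocks also hold vars.
  std::vector<std::vector<std::string>> blocks;
};

class RuntimeProgram {
 public:
  RuntimeProgram(std::shared_ptr<const ProgramDesc> desc, int block_idx)
      : desc_(std::move(desc)), block_idx_(block_idx) {}
  void Run() {
    for (const auto& op : desc_->blocks[block_idx_]) {
      std::cout << "run op: " << op << " (block " << block_idx_ << ")\n";
    }
  }

 private:
  std::shared_ptr<const ProgramDesc> desc_;
  int block_idx_;
};

// A control-flow kernel keeps the whole program desc plus the index of its
// subblock and creates the subblock's RuntimeProgram lazily on first use.
// Because every nested control-flow op does the same, the scheme recurses.
class WhileKernel {
 public:
  WhileKernel(std::shared_ptr<const ProgramDesc> desc, int sub_block_idx)
      : desc_(std::move(desc)), sub_block_idx_(sub_block_idx) {}
  void Run(int trip_count) {
    if (!program_) {
      program_.reset(new RuntimeProgram(desc_, sub_block_idx_));
    }
    for (int i = 0; i < trip_count; ++i) program_->Run();
  }

 private:
  std::shared_ptr<const ProgramDesc> desc_;
  int sub_block_idx_;
  std::unique_ptr<RuntimeProgram> program_;
};

int main() {
  auto desc = std::make_shared<ProgramDesc>();
  desc->blocks = {{"feed", "while", "fetch"},     // block 0: main block
                  {"elementwise_mul", "print"}};  // block 1: while subblock
  WhileKernel while_kernel(desc, /*sub_block_idx=*/1);
  while_kernel.Run(/*trip_count=*/2);
  return 0;
}

The design point mirrored here is that the kernel stores a shared program descriptor and a block index instead of owning a raw block descriptor, so the descriptor can be shared safely across nested instantiations and each subblock can be optimized like any other block.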
......@@ -98,6 +98,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
......
......@@ -190,6 +190,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,17 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later version
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -46,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -31,14 +31,13 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(&program_desc_);
program_->UpdateVarsOfProgram(&program_desc_);
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), program_desc_, true);
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
break;
case lite_api::LiteModelType::kNaiveBuffer:
SaveModelNaive(dir, *program_->exec_scope(), program_desc_);
SaveModelNaive(dir, *program_->exec_scope(), *program_desc_.get());
break;
default:
LOG(FATAL) << "Unknown model type";
......@@ -52,17 +51,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source_file name from op type and kernel type
......@@ -164,9 +167,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -232,7 +235,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
return *program_desc_.get();
}
const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
......@@ -250,7 +253,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -276,14 +278,14 @@ void Predictor::Build(const std::string &model_path,
model_file,
param_file,
scope_.get(),
&program_desc_,
program_desc_.get(),
combined_param,
model_from_memory);
} break;
case lite_api::LiteModelType::kNaiveBuffer:
CHECK(!model_path.empty())
<< "NaiveBuffer backend only supported combined param";
LoadModelNaiveFromFile(model_path, scope_.get(), &program_desc_);
LoadModelNaiveFromFile(model_path, scope_.get(), program_desc_.get());
break;
default:
LOG(FATAL) << "Unknown model type";
......@@ -291,10 +293,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const cpp::ProgramDesc &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -313,9 +315,9 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
"fake_dequantize_max_abs",
"fake_channel_wise_dequantize_max_abs"};
bool is_quantized_model = false;
for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
for (size_t i = 0; i < program_desc_->BlocksSize() && !is_quantized_model;
++i) {
auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
auto *block_desc = program_desc_->GetBlock<cpp::BlockDesc>(i);
for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
std::string op_type = op_desc->Type();
......@@ -331,7 +333,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
core::KernelPickFactor factor;
factor.ConsiderTarget();
......
......@@ -42,7 +42,10 @@ static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
class LITE_API Predictor {
public:
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
Predictor() {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
}
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
......@@ -64,7 +67,7 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const cpp::ProgramDesc& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
......@@ -100,6 +103,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......@@ -119,7 +123,7 @@ class LITE_API Predictor {
private:
Optimizer optimizer_;
cpp::ProgramDesc program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
const Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_;
......
......@@ -71,8 +71,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
raw_predictor_.scope(), config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
......
......@@ -24,16 +24,16 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -45,15 +45,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -62,7 +62,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -111,15 +111,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -134,54 +136,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
// Extract the ops and generate the runtime program only from the main
// block desc
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -207,10 +190,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,8 +38,10 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
}
......
......@@ -53,6 +53,7 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
......
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -32,7 +32,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -24,25 +24,27 @@ void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which is used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -52,7 +54,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +236,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -43,7 +43,7 @@ class TestCase {
: place_(place), scope_(new Scope), alias_(alias) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareScopes();
......
......@@ -17,10 +17,6 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#endif
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -86,15 +87,19 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir() {
return subgraph_model_cache_dir_;
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
private:
static std::string subgraph_model_cache_dir_;
};
#endif
......
......@@ -29,6 +29,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the nodes
// which are only linked to the control flow op nodes but never linked to the
// other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to by any op
// in the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
// A tensor array may be used only as an output var in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
// remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes,
// which are only linked to the control flow op nodes but never linked to the
// other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
// Remove the nodes which are used in the subblock recursively; the remaining
// nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -412,10 +412,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
......@@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes_in_order = graph->StmtTopologicalOrder();
}
insts_.emplace_back();
for (auto& item : nodes_in_order) {
if (item->IsStmt()) {
auto& stmt = item->AsStmt();
......@@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front()));
}
}
}
......
......@@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass {
}
private:
std::vector<Instruction> insts_;
std::vector<std::vector<Instruction>> insts_;
};
} // namespace mir
......
......@@ -259,13 +259,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
head_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateInputTo(
sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -408,21 +414,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
tail_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* graph like this
* subgraph_op_0
* / \
* / \
* subgraph_op_1 host_op
*/
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -446,15 +458,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
}
}
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node,
Node* inst_node) {
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx);
CHECK(sub_op_desc);
if (sub_op_desc->Type() == "conv2d") {
for (auto& names : sub_op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
......
......@@ -151,61 +151,61 @@ Node *SSAGraph::GraphCreateInstructNode(
}
void SSAGraph::Build(const Program &program,
const std::vector<Place> &valid_places) {
const std::vector<Place> &valid_places,
int block_idx) {
CHECK(node_storage_.empty());
auto weights_name = program.weights();
auto is_weights = [&](const std::string &name) -> bool {
auto it = std::find(weights_name.begin(), weights_name.end(), name);
if (it == weights_name.end()) return false;
auto weights = program.weights();
auto is_weight = [&](const std::string &name) -> bool {
auto it = std::find(weights.begin(), weights.end(), name);
if (it == weights.end()) return false;
return true;
};
std::unordered_map<std::string, PrecisionType> var_types =
program.var_data_type();
std::unordered_map<std::string, mir::Node *> arg_update_node_map_;
for (auto &op : program.ops()) {
auto var_type_map = program.var_type_map();
std::map<std::string, mir::Node *> arg_update_node_map;
for (auto &op : program.ops(block_idx)) {
VLOG(3) << op->op_info()->Type();
auto *op_node = GraphCreateInstructNode(op, valid_places);
for (const std::string &name : op->op_info()->input_names()) {
auto *op_info = op->op_info();
const auto &op_type = op_info->Type();
for (const auto &var_name : op_info->input_names()) {
mir::Node *arg_node = nullptr;
if (arg_update_node_map_.count(name)) {
arg_node = arg_update_node_map_.at(name);
if (arg_update_node_map.count(var_name)) {
arg_node = arg_update_node_map.at(var_name);
} else {
node_storage_.emplace_back();
arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
}
if (var_types.count(name)) {
if (var_type_map.count(var_name)) {
if (!arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->arg()->type = var_type_map[var_name];
}
// Store the original data type of the output tensors for
// type_precision_cast_pass, to keep the output types consistent between
// the original graph and the optimized graph
if (op->op_info()->Type() == "fetch") {
if (op_type == "fetch") {
op->mutable_op_info()->SetAttr<int>(
"data_type", static_cast<int>(var_types[name]));
"data_type",
static_cast<int>(var_type_map[var_name]->precision()));
}
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
}
for (const std::string &name : op->op_info()->output_names()) {
for (const auto &var_name : op->op_info()->output_names()) {
node_storage_.emplace_back();
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
if (var_type_map.count(var_name) && !arg_node->arg()->type) {
arg_node->arg()->type = var_type_map[var_name];
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(op_node, arg_node);
}
......
......@@ -35,9 +35,13 @@ class GraphBase {};
class SSAGraph : GraphBase {
public:
// @param program: the op program
// @param program: the target program with vars and ops
// @param valid_places: the valid places user set for the system.
void Build(const Program &program, const std::vector<Place> &valid_places);
// @param block_idx: the block index in the target program, default is 0 (the
// main block)
void Build(const Program &program,
const std::vector<Place> &valid_places,
int block_idx = kRootBlockIdx);
void RemoveNode(const mir::Node *node);
std::vector<mir::Node *> StmtTopologicalOrder();
......
......@@ -15,7 +15,6 @@
#include "lite/core/mir/subgraph/subgraph_detector.h"
#include <memory>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/dot.h"
......@@ -46,13 +45,13 @@ std::string SubgraphVisualizer::operator()() {
"khaki1", "ivory4", "sandybrown", "olivedrab2",
"turquoise4", "snow3", "sienna4", "salmon2",
};
std::unordered_map<Node *, int> subgraph_indices;
std::map<Node *, int> subgraph_indices;
for (size_t i = 0; i < subgraphs_.size(); i++) {
for (size_t j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i;
}
}
std::unordered_map<std::string, int> exists_ops;
std::map<std::string, int> exists_ops;
std::set<std::string> exists_args;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) {
......@@ -125,9 +124,9 @@ void SubgraphDetector::node_dat_t::UnionFindCombine(node_dat_t *candidate) {
candidate->union_find_parent = union_find_parent;
// Obtain the input and output nodes for the combined one
std::unordered_set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
std::unordered_set<node_dat_t *> outputs(candidate->outlinks.begin(),
candidate->outlinks.end());
std::set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
std::set<node_dat_t *> outputs(candidate->outlinks.begin(),
candidate->outlinks.end());
for (auto *out_node : outlinks) {
if (out_node != candidate) {
outputs.insert(out_node);
......@@ -185,7 +184,7 @@ void SubgraphDetector::FlexibleDFS(
for (auto &node : source) {
stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
}
std::unordered_set<const node_dat_t *> visited;
std::set<const node_dat_t *> visited;
while (!stack.empty()) {
auto top = stack.back();
stack.pop_back();
......@@ -210,9 +209,9 @@ void SubgraphDetector::FlexibleDFS(
}
}
std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
std::set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
// get exclude nodes from config file
std::unordered_set<Node *> excluded_nodes;
std::set<Node *> excluded_nodes;
std::string config_file_path =
GetStringFromEnv(SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE);
if (!IsFileExists(config_file_path)) {
......@@ -285,7 +284,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
void SubgraphDetector::InitNodes(node_map_t *nodes) {
// Initialize and mark the subgraph detector nodes based on teller.
std::unordered_set<Node *> excluded_nodes = GetExcludedNodesFromConfigFile();
std::set<Node *> excluded_nodes = GetExcludedNodesFromConfigFile();
for (auto &it : *nodes) {
for (auto &in_node : it.first->inlinks) {
it.second->inlinks.push_back((*nodes)[in_node]);
......@@ -337,7 +336,7 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
// then the src and dst nodes can not be fused into one node,
// otherwise it can be done.
while (true) {
std::unordered_set<node_dat_t *> contract_nodes;
std::set<node_dat_t *> contract_nodes;
for (auto *out_node : node->outlinks) {
// must be a candidate
if (!out_node->marked) continue;
......@@ -372,7 +371,7 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
}
}
std::unordered_map<node_dat_t * /*ancestor*/, std::vector<Node *>> clusters;
std::map<node_dat_t * /*ancestor*/, std::vector<Node *>> clusters;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) continue;
if ((*nodes)[node]->marked) {
......@@ -412,25 +411,26 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of Ops and Vars of the target
// subgraph and sub_block_idx is set as a attribute of subgraph op,
// sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1);
auto sub_block_desc = new cpp::BlockDesc();
// Create a program desc and a block desc for storing all of the ops and vars
// of the target subgraph; sub_block_idx is set as an attribute of the subgraph
// op, and sub_block_idx = 0 means it's a new subgraph op
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
for (auto &op_node : subgraph_nodes) {
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_node->AsStmt().op_info();
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_node->AsStmt().op_info();
}
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
// Extract input and output nodes from the target subgraph
std::unordered_set<Node *> idata_var_nodes;
std::unordered_set<Node *> weight_var_nodes;
std::unordered_set<Node *> odata_var_nodes;
std::unordered_set<Node *> local_var_nodes;
std::unordered_set<Node *> unused_var_nodes;
std::set<Node *> idata_var_nodes;
std::set<Node *> weight_var_nodes;
std::set<Node *> odata_var_nodes;
std::set<Node *> local_var_nodes;
std::set<Node *> unused_var_nodes;
ExtractInputsOutputs(subgraph_nodes,
&idata_var_nodes,
&weight_var_nodes,
......@@ -438,13 +438,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
&local_var_nodes,
&unused_var_nodes);
// A simplified model without the original weight/local/unused nodes on the
// subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to
// true and Predictor->Run(...), Predictor->Save(...) is called.
std::unordered_set<Node *> input_var_nodes(idata_var_nodes.begin(),
idata_var_nodes.end());
std::unordered_set<Node *> output_var_nodes(odata_var_nodes.begin(),
odata_var_nodes.end());
if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) {
// subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to
// true (default) and Predictor->Run(...), Predictor->Save(...) is called.
std::set<Node *> input_var_nodes(idata_var_nodes.begin(),
idata_var_nodes.end());
std::set<Node *> output_var_nodes(odata_var_nodes.begin(),
odata_var_nodes.end());
if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) {
input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end());
output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end());
output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end());
......@@ -477,7 +477,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetOutput("Outputs", output_var_names);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp *>(subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
......@@ -516,11 +516,11 @@ void SubgraphFuser::operator()() {
}
void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
std::unordered_set<Node *> *input_var_nodes,
std::unordered_set<Node *> *weight_var_nodes,
std::unordered_set<Node *> *output_var_nodes,
std::unordered_set<Node *> *local_var_nodes,
std::unordered_set<Node *> *unused_var_nodes) {
std::set<Node *> *input_var_nodes,
std::set<Node *> *weight_var_nodes,
std::set<Node *> *output_var_nodes,
std::set<Node *> *local_var_nodes,
std::set<Node *> *unused_var_nodes) {
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
if (var_node->AsArg().is_weight) {
......@@ -564,7 +564,7 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
std::unordered_set<const Node *> GetNodes2RM(
const std::vector<Node *> &op_nodes,
const std::vector<std::unordered_set<Node *>> &excluded_var_nodes) {
const std::vector<std::set<Node *>> &excluded_var_nodes) {
std::unordered_set<const Node *> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
......@@ -600,8 +600,8 @@ std::unordered_set<const Node *> GetNodes2RM(
}
static void SortHelper(Node *node,
const std::unordered_set<Node *> &unordered_nodes,
std::unordered_set<const Node *> *visited_nodes,
const std::set<Node *> &unordered_nodes,
std::set<const Node *> *visited_nodes,
std::vector<Node *> *ordered_nodes) {
for (auto &var_node : node->inlinks) {
if (var_node->inlinks.empty()) continue;
......@@ -615,8 +615,8 @@ static void SortHelper(Node *node,
}
std::vector<Node *> GetTopologicalOrder(
const std::unordered_set<Node *> &unordered_nodes) {
std::unordered_set<const Node *> visited_nodes;
const std::set<Node *> &unordered_nodes) {
std::set<const Node *> visited_nodes;
std::vector<Node *> ordered_nodes;
for (auto &node : unordered_nodes) {
if (!node->IsStmt()) continue;
......
......@@ -16,8 +16,8 @@
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/core/mir/pass.h"
......@@ -51,7 +51,7 @@ class SubgraphDetector {
// pointer of the Node. This is to avoid changing the original graph in the
// process of graph analysis.
struct node_dat_t;
using node_map_t = std::unordered_map<Node*, node_dat_t*>;
using node_map_t = std::map<Node*, node_dat_t*>;
using node_set_t = std::vector<node_dat_t*>;
struct node_dat_t {
explicit node_dat_t(Node* _node) : node(_node) {}
......@@ -73,7 +73,7 @@ class SubgraphDetector {
const std::function<bool(const node_dat_t*)>& enter,
const std::function<bool(const node_dat_t*)>& leave);
std::unordered_set<Node*> GetExcludedNodesFromConfigFile();
std::set<Node*> GetExcludedNodesFromConfigFile();
void InitNodes(node_map_t* nodes);
......@@ -114,18 +114,17 @@ class SubgraphFuser {
};
void ExtractInputsOutputs(const std::vector<Node*>& op_nodes,
std::unordered_set<Node*>* input_var_nodes,
std::unordered_set<Node*>* weight_var_nodes,
std::unordered_set<Node*>* output_var_nodes,
std::unordered_set<Node*>* local_var_nodes,
std::unordered_set<Node*>* unused_var_nodes);
std::set<Node*>* input_var_nodes,
std::set<Node*>* weight_var_nodes,
std::set<Node*>* output_var_nodes,
std::set<Node*>* local_var_nodes,
std::set<Node*>* unused_var_nodes);
std::unordered_set<const Node*> GetNodes2RM(
const std::vector<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_var_nodes);
const std::vector<std::set<Node*>>& excluded_var_nodes);
std::vector<Node*> GetTopologicalOrder(
const std::unordered_set<Node*>& unordered_nodes);
std::vector<Node*> GetTopologicalOrder(const std::set<Node*>& unordered_nodes);
} // namespace mir
} // namespace lite
......
......@@ -141,12 +141,11 @@ std::vector<std::string> AddFetchDesc(
}
TEST(Subgraph, detect_simple_model) {
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
// Build a simple network
program_desc.ClearBlocks();
auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>();
block_desc->ClearOps();
block_desc->ClearVars();
auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
......@@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) {
"the path of model files.";
return;
}
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
auto scope = std::make_shared<Scope>();
LoadModelPb(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
scope.get(),
&program_desc,
program_desc.get(),
!FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
false);
std::vector<Place> valid_places({
......
......@@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op,
op_desc->GetAttr<std::vector<std::string>>("input_data_names");
std::replace(input_data_names.begin(), input_data_names.end(), from, to);
op_desc->SetAttr("input_data_names", input_data_names);
auto* subblock_desc = static_cast<operators::SubgraphOp*>(op)->GetSubBlock();
CHECK(subblock_desc);
for (size_t i = 0; i < subblock_desc->OpsSize(); i++) {
auto* subblock_op_desc = subblock_desc->GetOp<cpp::OpDesc>(i);
for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) {
for (auto& subblock_var_name : subblock_op_input.second) {
if (subblock_var_name == from) {
subblock_var_name = to;
auto sub_program_desc =
static_cast<operators::SubgraphOp*>(op)->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx = op_desc->GetAttr<int32_t>("sub_block");
auto sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) {
for (auto& sub_var_name : sub_op_input.second) {
if (sub_var_name == from) {
sub_var_name = to;
}
}
}
......
......@@ -59,22 +59,43 @@ class VariablePlaceInferencePass : public DebugPass {
}
// Set the type of the weight
void SetWeightType(Node* w,
void SetWeightType(Node* weight_node,
const LiteType& type,
const std::map<std::string, bool>& lite_with_targets) {
const std::map<std::string, bool>& with_targets) {
VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
if (lite_with_targets.at("kFPGA")) {
w->AsArg().type = LiteType::GetTensorTy(
if (with_targets.at("kFPGA")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy(
} else if (with_targets.at("kOpenCL")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else {
w->AsArg().type = LiteType::GetTensorTy(
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
}
}
// Update a's kUnk fields from b's fields.
void UpdateTypeFrom(const Type** a, const Type* b) {
auto target = (*a)->target();
auto precision = (*a)->precision();
auto layout = (*a)->layout();
if (target == TARGET(kUnk)) {
target = b->target();
}
if (precision == PRECISION(kUnk)) {
precision = b->precision();
}
if (layout == DATALAYOUT(kUnk)) {
layout = b->layout();
}
if ((*a)->IsTensor() && b->IsTensor()) {
*a = LiteType::GetTensorTy(target, precision, layout);
} else if ((*a)->IsTensorList() && b->IsTensorList()) {
*a = LiteType::GetTensorListTy(target, precision, layout);
}
}
void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
......@@ -85,121 +106,89 @@ class VariablePlaceInferencePass : public DebugPass {
}
return false;
};
std::map<std::string, bool> lite_with_targets{
std::map<std::string, bool> with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"];
VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
for (auto& node : graph->StmtTopologicalOrder()) {
auto& inst = node->AsStmt();
const auto* op_info = inst.op_info();
const auto& op_type = op_info->Type();
auto& kernel = inst.picked_kernel();
// The IoCopyOp is a tool operator, it won't support the type inference.
// in FPGA, we have io_copy+cali+layout tool ops, so we need type inference
// for
// tool operator
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
// for tool operator
if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) {
VLOG(3) << "skip 'io_copy' if target is FPGA and OpenCL";
if (op_type == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments.
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "can not found op arguments for node "
<< node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type, lite_with_targets);
// Inferring the input and output variable's place according to the
// declaration of I/O arguments of the picked kernel of the op
VLOG(4) << "Op " << op_info->Repr();
for (auto* in_node : node->inlinks) {
auto& var = in_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetInputArgname(var_name, &arg_name))
<< "Can not find the input argument for var " << var_name;
VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name;
const auto* decl_type = kernel.GetInputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(in_node, *decl_type, with_targets);
} else {
x_in->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_in->AsArg().type->target() == TARGET(kUnk) &&
x_in->AsArg().type->precision() != PRECISION(kUnk) &&
x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_in->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8)) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_in->AsArg().type->precision();
x_in->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
VLOG(4) << "inst " << inst.op_info()->Repr();
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
CHECK(arg_name.size() > 0) << "can not found op arguments for node "
<< node_name << " in Inst "
<< inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type, lite_with_targets);
for (auto* out_node : node->outlinks) {
auto& var = out_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetOutputArgname(var_name, &arg_name))
<< "Can not find the output argument for var " << var_name;
VLOG(4) << " - output arg name:" << arg_name
<< " var name:" << var_name;
const auto* decl_type = kernel.GetOutputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(out_node, *decl_type, with_targets);
} else {
x_out->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_out->AsArg().type->target() == TARGET(kUnk) &&
x_out->AsArg().type->precision() != PRECISION(kUnk) &&
x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_out->AsArg().type = type;
} else if (type->precision() == PRECISION(kFP16) &&
type->target() != TARGET(kOpenCL)) {
x_out->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8) ||
(decl_type->precision() == PRECISION(kFP16) &&
decl_type->target() != TARGET(kOpenCL))) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_out->AsArg().type->precision();
x_out->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
}
}
// Update me's kUnk fields by other's fields.
void UpdatePlace(Place* me, const Place& other) {
CHECK(other.is_valid());
if (me->target == TARGET(kUnk)) {
me->target = other.target;
}
if (me->precision == PRECISION(kUnk)) {
me->precision = other.precision;
}
if (me->layout == DATALAYOUT(kUnk)) {
me->layout = other.layout;
}
}
private:
// The default target for arguments, e.g. load weights to CPU memory for CUDA
// computation by default.
// The default target for arguments, e.g. load weights to CPU memory for
// CUDA computation by default.
TargetType argument_default_target_{TARGET(kHost)};
};
......
......@@ -99,7 +99,7 @@ class OpLite : public Registry {
std::vector<std::unique_ptr<KernelBase>> CreateKernels(
const std::vector<Place> &places, const std::string &kernel_type = "");
lite::Scope *scope() { return scope_; }
Scope *scope() { return scope_; }
// Assign op param to kernel.
virtual void AttachKernel(KernelBase *kernel) = 0;
......@@ -169,7 +169,7 @@ class OpLite : public Registry {
}
protected:
lite::Scope *scope_{nullptr};
Scope *scope_{nullptr};
std::unique_ptr<KernelBase> kernel_;
std::string op_type_;
std::vector<Place> valid_places_;
......
......@@ -17,7 +17,9 @@
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include "lite/core/mir/generate_program_pass.h"
#include "lite/core/mir/pass_manager.h"
#include "lite/core/mir/pass_utils.h"
......@@ -35,6 +37,9 @@ namespace lite {
* lite::Optimizer optimizes a program. It utilizes the MIR passes to analyze the
* program and export an optimized program.
*/
// TODO(hong19860320) Support the following passes for the subblocks
const std::set<std::string> kSubblockUnsupportedPasses(
{"memory_optimize_pass"});
class Optimizer {
public:
void Run(Program&& program,
......@@ -44,14 +49,20 @@ class Optimizer {
program_ = &program;
valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found";
CHECK(graphs_.empty()) << "duplicate optimize found";
graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places);
auto block_size = program.block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
std::unique_ptr<mir::SSAGraph> graph;
graph.reset(new mir::SSAGraph);
graph->Build(program, valid_places, block_idx);
graph->SetValidPlaces(valid_places);
graphs_.emplace_back(std::move(graph));
}
SpecifyKernelPickTactic(kernel_pick_factor);
InitTargetTypeTransformPass();
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
......@@ -92,6 +103,7 @@ class Optimizer {
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
......@@ -157,62 +169,15 @@ class Optimizer {
exec_scope_ = program.exec_scope();
}
const lite::Scope* exec_scope() const { return exec_scope_; }
// Set shape(dims) infos of var descs to scope var.
// developer can write pass using input / output tensor dims of op.
//
// Example: If you have node `Node* softmax_node`,
// you can get dims of output tensor in passes:
//
// auto* scope = softmax_node->AsStmt().op()->scope();
// auto softmax_out_arg_name =
// softmax_node->outlinks.front()->AsArg().name;
// auto softmax_out_tensor =
// scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
// softmax_out_dims = softmax_out_tensor.dims();
void SetVarDescShapeToScopeVar() {
auto dims_to_str_func = [](std::vector<int64_t> shape) -> std::string {
std::string str_res;
for (size_t i = 0; i < shape.size(); ++i) {
str_res += std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_res += "x";
}
}
return str_res;
};
auto* program_desc = program_->program_desc();
VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize();
auto blocks_desc = program_desc->GetBlocks();
for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) {
auto block_desc = blocks_desc[bidx];
auto vars_desc = block_desc.GetVars();
for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) {
auto var_desc = vars_desc[vidx];
VLOG(5) << var_desc.Name() << " "
<< dims_to_str_func(var_desc.GetShape());
if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue;
auto* var = program_->exec_scope()->FindVar(var_desc.Name());
auto tensor = var->GetMutable<lite::Tensor>();
if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) {
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape());
tensor->Resize(var_desc.GetShape());
}
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape())
<< " tensor:" << tensor->dims();
}
}
}
const Scope* exec_scope() const { return exec_scope_; }
// Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
"generate_program_pass");
pass->Apply(graph_);
for (auto& graph : graphs_) {
pass->Apply(graph);
}
auto program = pass->GenProgram();
CHECK(exec_scope_);
program->set_exec_scope(exec_scope_);
......@@ -228,27 +193,38 @@ class Optimizer {
pass->SetValidPlaces(valid_places_);
}
void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() {
auto* pass =
mir::PassManager::Global()
.LookUp<mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass>(
"control_flow_op_unused_inputs_and_outputs_eliminate_pass");
CHECK(pass);
CHECK(!graphs_.empty());
pass->SetAllGraphs(&graphs_);
}
// Generate C++ code which combines the inference program, model and weights.
void GenCode(const std::string& code_dir);
const mir::SSAGraph& ssa_graph() const {
CHECK(graph_);
return *graph_;
const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return *graphs_[block_idx];
}
mir::SSAGraph* mutable_ssa_graph() {
CHECK(graph_);
return graph_.get();
mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return graphs_[block_idx].get();
}
lite::Scope* exec_scope() { return exec_scope_; }
Scope* exec_scope() { return exec_scope_; }
protected:
void SpecifyKernelPickTactic(core::KernelPickFactor factor);
// Specify the passes and run them.
void RunPasses(const std::vector<std::string>& passes) {
SetVarDescShapeToScopeVar();
for (auto& x : passes) {
LOG(INFO) << "== Running pass: " << x;
mir::Pass* pass = mir::PassManager::Global().LookUp(x);
......@@ -266,16 +242,23 @@ class Optimizer {
LOG(INFO) << " - Skip " << x
<< " because the target or kernel does not match.";
} else {
pass->Apply(graph_);
// Check the pass whether it is supported for processing subblocks
if (kSubblockUnsupportedPasses.count(x)) {
pass->Apply(graphs_[kRootBlockIdx]);
} else {
for (auto& graph : graphs_) {
pass->Apply(graph);
}
}
LOG(INFO) << "== Finished running: " << x;
}
}
}
private:
std::unique_ptr<mir::SSAGraph> graph_;
std::vector<std::unique_ptr<mir::SSAGraph>> graphs_;
std::vector<Place> valid_places_;
lite::Scope* exec_scope_{};
Scope* exec_scope_{};
Program* program_{};
};
......
This diff has been collapsed.
......@@ -14,6 +14,7 @@
#pragma once
#include <list>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
......@@ -38,58 +39,68 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__";
// - scope: which contains all the weights
struct Program {
public:
explicit Program(const std::shared_ptr<Scope>& root) { scope_ = root; }
Program(const cpp::ProgramDesc& desc,
const std::shared_ptr<Scope>& root,
explicit Program(const std::shared_ptr<Scope>& root_scope) {
scope_ = root_scope;
}
Program(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places)
: scope_(root), valid_places_(valid_places), desc_(desc) {
: scope_(root_scope),
valid_places_(valid_places),
program_desc_(program_desc) {
CHECK(scope_) << "scope should be init first";
VLOG(4) << "prepare work";
PrepareWorkspace(desc);
PrepareWorkspace(program_desc_);
VLOG(4) << "build desc";
Build(desc);
Build(program_desc_);
VLOG(4) << "build desc finished";
}
std::unique_ptr<Program> Clone() const {
std::unique_ptr<Program> res(new Program(desc_, scope_, valid_places_));
return res;
return std::unique_ptr<Program>(
new Program(program_desc_, scope_, valid_places_));
}
const std::list<std::string>& weights() const { return weights_; }
const std::list<std::string>& tmp_vars() const { return tmp_vars_; }
const std::list<std::string>& vars() const { return vars_; }
std::list<std::string>* mutable_weights() { return &weights_; }
std::list<std::string>* mutable_tmp_vars() { return &tmp_vars_; }
std::list<std::string>* mutable_vars() { return &vars_; }
const std::list<std::shared_ptr<OpLite>>& ops() const { return ops_; }
std::list<std::shared_ptr<OpLite>>* mutable_ops() { return &ops_; }
const std::list<std::shared_ptr<OpLite>>& ops(
int block_idx = kRootBlockIdx) const {
return ops_[block_idx];
}
std::list<std::shared_ptr<OpLite>>* mutable_ops(
int block_idx = kRootBlockIdx) {
return &ops_[block_idx];
}
lite::Scope* exec_scope() { return exec_scope_; }
lite::Scope* scope() { return scope_.get(); }
size_t block_size() { return ops_.size(); }
cpp::ProgramDesc* program_desc() { return &desc_; }
Scope* exec_scope() { return exec_scope_; }
Scope* scope() { return scope_.get(); }
const std::unordered_map<std::string, PrecisionType>& var_data_type() const {
return var_data_type_;
const std::map<std::string, const Type*>& var_type_map() const {
return var_type_map_;
}
private:
// Build from a program and scope.
void Build(const cpp::ProgramDesc& program);
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc);
// Create temporary variables.
void PrepareWorkspace(const cpp::ProgramDesc& program);
void PrepareWorkspace(const std::shared_ptr<cpp::ProgramDesc>& program_desc);
private:
std::unordered_map<std::string, PrecisionType> var_data_type_;
std::list<std::string> tmp_vars_;
std::map<std::string, const Type*> var_type_map_;
std::list<std::string> vars_;
std::list<std::string> weights_;
std::list<std::shared_ptr<OpLite>> ops_;
std::vector<std::list<std::shared_ptr<OpLite>>> ops_;
// the scope to run the kernels, NOTE this is the execution scope.
std::shared_ptr<lite::Scope> scope_;
std::shared_ptr<Scope> scope_;
std::vector<Place> valid_places_;
// Runtime scope.
lite::Scope* exec_scope_{};
cpp::ProgramDesc desc_;
Scope* exec_scope_{};
std::shared_ptr<cpp::ProgramDesc> program_desc_;
};
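// A minimal usage sketch (illustrative only, not part of this patch), assuming
// `program` is a Program built from a multi-block ProgramDesc: the op list is
// now kept per block instead of only for the root block.
for (size_t block_idx = 0; block_idx < program.block_size(); ++block_idx) {
  for (auto& op : program.ops(block_idx)) {
    VLOG(4) << "block " << block_idx << " op " << op->op_info()->Type();
  }
}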
struct Instruction {
......@@ -167,15 +178,14 @@ struct Instruction {
*/
class LITE_API RuntimeProgram {
public:
explicit RuntimeProgram(std::vector<Instruction>&& insts)
explicit RuntimeProgram(std::vector<std::vector<Instruction>>&& insts)
: instructions_(std::move(insts)) {
if (instructions_.empty()) {
LOG(FATAL) << "no instructions";
}
#ifdef LITE_WITH_PROFILE
set_profiler();
#endif
Init();
}
explicit RuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
int block_idx = kRootBlockIdx);
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
......@@ -183,34 +193,46 @@ class LITE_API RuntimeProgram {
#endif // LITE_WITH_PROFILE
}
void Init() {
if (instructions_.empty()) {
LOG(FATAL) << "no instructions";
}
#ifdef LITE_WITH_PROFILE
set_profiler();
#endif
}
void Run();
void set_exec_scope(lite::Scope* x) { exec_scope_ = x; }
lite::Scope* exec_scope() { return exec_scope_; }
void set_exec_scope(Scope* x) { exec_scope_ = x; }
Scope* exec_scope() { return exec_scope_; }
size_t num_instructions() const { return instructions_.size(); }
const std::vector<Instruction>& instructions(
int block_idx = kRootBlockIdx) const {
return instructions_[block_idx];
}
const std::vector<Instruction>& instructions() const { return instructions_; }
std::vector<Instruction>* mutable_instructions(
int block_idx = kRootBlockIdx) {
return &instructions_[block_idx];
}
// `SaveOpInfosToProgram` will update the op list(ops_) of the block 0
// in ProgramDesc.
void SaveOpInfosToProgram(cpp::ProgramDesc* desc);
size_t block_size() { return instructions_.size(); }
// `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in
// ProgramDesc. Namely, if a new var created in some passes, its var_desc will
// be added in vars_.
void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
// Update the ops and vars of all the blocks to the given program_desc
// according to the instructions
void SaveToProgram(std::shared_ptr<cpp::ProgramDesc> program_desc);
private:
RuntimeProgram(const RuntimeProgram&) = delete;
std::vector<Instruction> instructions_;
lite::Scope* exec_scope_{};
std::vector<std::vector<Instruction>> instructions_;
Scope* exec_scope_{};
#ifdef LITE_WITH_PROFILE
profile::Profiler profiler_;
void set_profiler() {
for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
i->set_profiler(&profiler_);
for (auto& inst : instructions_[kRootBlockIdx]) {
inst.set_profiler(&profiler_);
}
}
#endif
......
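// A minimal sketch (illustrative only, not part of this patch) of the new
// RuntimeProgram entry points, assuming `program_desc` is a
// std::shared_ptr<cpp::ProgramDesc> and `exec_scope` is a Scope*; the
// while/conditional_block/subgraph kernels build such a program with their
// sub-block index and run it recursively.
auto runtime_program = std::unique_ptr<RuntimeProgram>(
    new RuntimeProgram(program_desc, exec_scope, kRootBlockIdx));
runtime_program->Run();  // execute the instructions of the chosen block
// Write the optimized ops and vars of all blocks back into the program desc.
runtime_program->SaveToProgram(program_desc);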
......@@ -37,7 +37,7 @@ bool SubgraphEngine::BuildDeviceProgram() {
subgraph::apu::Graph graph;
int neuron_errCode = NeuronModel_create(&model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create model";
LOG(WARNING) << "[APU] Failed to create the neuron model!";
return false;
}
graph.set_model(model_);
......@@ -46,11 +46,12 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Convert all of the ops and their input vars and weights, and add them into
// the APU NIR graph
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -70,55 +71,38 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
}
// Get input tensor
std::vector<uint32_t> ins;
origin_itensors_.resize(input_names_.size());
origin_idims_.resize(input_names_.size());
// Get the index of input tensors
std::vector<uint32_t> input_indices;
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":"
<< origin_idims_[i].production();
// Get input index
int idx;
if (graph.Has(input_names_[i])) {
ins.push_back(graph.Get(input_names_[i])->index());
VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find input: " << input_names_[i];
return false;
}
CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node "
<< input_names_[i];
auto index = graph.Get(input_names_[i])->index();
input_indices.push_back(index);
VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims "
<< origin_itensors_[i]->dims() << " index " << index;
}
// Get output tensor
std::vector<uint32_t> outs;
origin_otensors_.resize(output_names_.size());
origin_odims_.resize(output_names_.size());
// Get the index of output tensors
std::vector<uint32_t> output_indices;
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":"
<< origin_odims_[i].production();
CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node "
<< output_names_[i];
origin_otensors_[i]->mutable_data<int8_t>();
// Get input index
if (graph.Has(output_names_[i])) {
outs.push_back(graph.Get(output_names_[i])->index());
VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find output: " << output_names_[i];
return false;
}
auto index = graph.Get(output_names_[i])->index();
output_indices.push_back(index);
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
<< origin_otensors_[i]->dims() << " index " << index;
}
VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size();
// Set subgraph input/output
NeuronModel_identifyInputsAndOutputs(
model_, ins.size(), &ins[0], outs.size(), &outs[0]);
// Identify the input and output tensors of the neuron model
NeuronModel_identifyInputsAndOutputs(model_,
input_indices.size(),
&input_indices[0],
output_indices.size(),
&output_indices[0]);
neuron_errCode = NeuronModel_finish(model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
return false;
}
VLOG(3) << "[APU] APU NIR model created!";
......@@ -209,11 +193,11 @@ SubgraphEngine::~SubgraphEngine() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
......@@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
~SubgraphEngine();
......
......@@ -76,8 +76,7 @@ add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.
add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -88,7 +87,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -202,17 +202,13 @@ void ElementwiseMulCompute<T, PType>::Run() {
}
}
template <>
void ElementwiseMulCompute<int64_t, PRECISION(kInt64)>::Run() {
auto& param = this->template Param<operators::ElementwiseParam>();
lite::arm::math::elementwise_compute_basic<int64_t>(param, "mul", "");
}
void ElementwiseMulActivationCompute::Run() {
auto& param = Param<operators::FusionElementwiseActivationParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
float* out_data = param.Out->mutable_data<float>();
template <typename T, PrecisionType PType>
void ElementwiseMulActivationCompute<T, PType>::Run() {
auto& param =
this->template Param<operators::FusionElementwiseActivationParam>();
auto* x_data = param.X->template data<T>();
auto* y_data = param.Y->template data<T>();
auto* out_data = param.Out->template mutable_data<T>();
int axis = param.axis;
std::string act_type = param.act_type;
auto x_dims = param.X->dims();
......@@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() {
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast<float>(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
y_data, x_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
x_data, y_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu(
lite::arm::math::elementwise_mul_relu<T>(
x_data, y_data, out_data, x_dims.production());
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
......@@ -403,46 +399,60 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_float =
using elementwise_mul_float_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def)
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_int32 =
using elementwise_mul_int32_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def)
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.Finalize();
using elementwise_mul_int64 =
using elementwise_mul_int64_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def)
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(
fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMulActivationCompute,
def)
using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
fusion_elementwise_mul_activation_float_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kInt64,
kNCHW,
fusion_elementwise_mul_activation_int64_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max,
kARM,
kFloat,
......@@ -466,22 +476,22 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_fp32 =
using elementwise_div_fp32_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def)
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_int64 =
using elementwise_div_int64_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def)
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
......
......@@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite<TARGET(kARM), PType> {
virtual ~ElementwiseMulCompute() = default;
};
class ElementwiseMulActivationCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ElementwiseMulActivationCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
......
......@@ -423,13 +423,15 @@ TEST(fusion_elementwise_mul_activation_arm, retrive_op) {
}
TEST(fusion_elementwise_mul_activation_arm, init) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat));
ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM));
}
TEST(fusion_elementwise_mul_activation_arm, compute) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
......
......@@ -20,44 +20,45 @@ namespace lite {
namespace kernels {
namespace arm {
template <typename T>
template <typename IndexType, typename DataType>
void GatherFunc(const operators::GatherParam& param) {
auto src_dims = param.X->dims();
auto index_size = param.Index->dims()[0];
auto* p_src = param.X->data<T>();
const int* p_index = param.Index->data<int>();
auto* p_output = param.Out->mutable_data<T>();
auto* p_src = param.X->data<DataType>();
const IndexType* p_index = param.Index->data<IndexType>();
auto* p_output = param.Out->mutable_data<DataType>();
int slice_size = 1;
for (size_t i = 1; i < src_dims.size(); ++i) {
slice_size *= src_dims[i];
}
for (int i = 0; i < index_size; ++i) {
int index_ = p_index[i];
IndexType index_ = p_index[i];
memcpy(p_output + i * slice_size,
p_src + index_ * slice_size,
slice_size * sizeof(T));
slice_size * sizeof(DataType));
}
}
void GatherCompute::Run() {
auto& param = this->Param<operators::GatherParam>();
template <typename IndexType>
void GatherCompute<IndexType>::Run() {
auto& param = this->template Param<operators::GatherParam>();
switch (param.X->precision()) {
case PRECISION(kFloat):
GatherFunc<float>(param);
GatherFunc<IndexType, float>(param);
break;
case PRECISION(kInt8):
GatherFunc<int8_t>(param);
GatherFunc<IndexType, int8_t>(param);
break;
case PRECISION(kInt16):
GatherFunc<int16_t>(param);
GatherFunc<IndexType, int16_t>(param);
break;
case PRECISION(kInt32):
GatherFunc<int32_t>(param);
GatherFunc<IndexType, int32_t>(param);
break;
case PRECISION(kInt64):
GatherFunc<int64_t>(param);
GatherFunc<IndexType, int64_t>(param);
break;
default:
LOG(FATAL) << "Gather does not implement for the "
......@@ -70,9 +71,26 @@ void GatherCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def)
REGISTER_LITE_KERNEL(gather,
kARM,
kAny,
kNCHW,
paddle::lite::kernels::arm::GatherCompute<int32_t>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(gather,
kARM,
kAny,
kNCHW,
paddle::lite::kernels::arm::GatherCompute<int64_t>,
def_int64_idx)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
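// A minimal sketch (illustrative only, not part of this patch) of what
// GatherFunc<IndexType, DataType> computes, assuming GatherParam exposes the
// X/Index/Out pointers used above: row i of Out is row Index[i] of X, where a
// row covers all dimensions except the first.
paddle::lite::Tensor x, index, out;
x.Resize({4, 2});
auto* x_data = x.mutable_data<float>();
for (int i = 0; i < 8; ++i) x_data[i] = static_cast<float>(i);
index.Resize({2});
auto* index_data = index.mutable_data<int64_t>();
index_data[0] = 3;
index_data[1] = 0;
out.Resize({2, 2});
paddle::lite::operators::GatherParam param;
param.X = &x;
param.Index = &index;
param.Out = &out;
paddle::lite::kernels::arm::GatherFunc<int64_t, float>(param);
// out now holds {6, 7, 0, 1}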
......@@ -23,6 +23,7 @@ namespace lite {
namespace kernels {
namespace arm {
template <typename IndexType>
class GatherCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
public:
void Run() override;
......
......@@ -28,36 +28,17 @@ namespace lite {
namespace kernels {
namespace bm {
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
// Obtain the origin input tensors, and create the origin output
// tensors(Don't try to access them before launch the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_inputs_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
device_inputs_[i].reset(new hiai::AiTensor);
CHECK(device_inputs_[i]);
}
device_outputs_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
device_outputs_[i].reset(new hiai::AiTensor);
CHECK(device_outputs_[i]);
}
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
int status = 0;
subgraph::bm::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
graph.CreateCompilerHandle();
auto& ctx = this->ctx_->template As<BMContext>();
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -90,13 +71,11 @@ bool SubgraphEngine::BuildDeviceProgram() {
net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
auto& stage = net_info_->stages[0];
// input
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
device_inputs_.resize(input_names_.size());
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(net_info_->input_names[i]);
origin_itensors_[i] =
exec_scope_->FindMutableTensor(net_info_->input_names[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
bm_device_mem_t* p_mem =
static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
CHECK(p_mem != nullptr);
......@@ -109,8 +88,6 @@ bool SubgraphEngine::BuildDeviceProgram() {
stage.input_shapes[i]);
}
// output
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
device_outputs_.resize(net_info_->output_num);
int out_index = 0;
for (int i = 0; i < output_names_.size(); i++) {
......@@ -118,14 +95,13 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
for (int i = 0; i < net_info_->output_num; i++) {
Tensor* t_cur = scope_->FindMutableTensor(net_info_->output_names[i]);
Tensor* t_cur = exec_scope_->FindMutableTensor(net_info_->output_names[i]);
CHECK(t_cur != nullptr);
bm_device_mem_t* p_mem =
static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
CHECK(p_mem != nullptr);
if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) {
origin_otensors_[out_index] = t_cur;
origin_odims_[out_index] = origin_otensors_[out_index]->dims();
origin_otensors_[out_index]->mutable_data<float>();
out_index += 1;
}
......@@ -170,11 +146,11 @@ bool SubgraphEngine::LaunchDeviceProgram() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
......@@ -36,15 +36,18 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
protected:
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
......
......@@ -13,3 +13,6 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li
add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps})
add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps})
add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps})
add_kernel(print_compute_host Host extra SRCS print_compute.cc DEPS ${lite_kernel_deps})
add_kernel(while_compute_host Host extra SRCS while_compute.cc DEPS ${lite_kernel_deps} program)
add_kernel(conditional_block_compute_host Host extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} program)
......@@ -51,3 +51,19 @@ REGISTER_LITE_KERNEL(
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(assign,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::AssignCompute,
def_tensor_array)
.BindInput("X",
{LiteType::GetTensorListTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorListTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
......@@ -12,28 +12,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/conditional_block_compute.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/kernels/host/conditional_block_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void ConditionalBlockCompute::PrepareForRun() {
auto& param = Param<operators::ConditionalBlockParam>();
auto cur_scope = param.scope;
executor_ =
std::make_shared<CondExecutor>(param.sub_block, cur_scope, place());
auto& param = this->Param<param_t>();
program_.reset(new RuntimeProgram(
param.program_desc, param.exec_scope, param.block_idx));
}
void ConditionalBlockCompute::Run() {
auto& param = Param<operators::ConditionalBlockParam>();
auto& param = this->Param<param_t>();
for (auto& out : param.outs) {
out->clear();
}
......@@ -43,32 +36,40 @@ void ConditionalBlockCompute::Run() {
auto* cond_data = cond->data<bool>();
need_run = cond_data[0];
} else {
auto x = param.x;
for (auto pt : x) {
if (pt == nullptr || !pt->IsInitialized() || pt->dims().empty()) {
for (auto input : param.inputs) {
if (input == nullptr || !input->IsInitialized() ||
input->dims().empty()) {
need_run = false;
break;
}
}
}
if (need_run) {
executor_->Run();
program_->Run();
}
}
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(conditional_block,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ConditionalBlockCompute,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::ConditionalBlockCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Cond", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Scope", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Input",
{LiteType::GetTensorListTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Cond",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorListTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Scope",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
......@@ -15,92 +15,30 @@
#pragma once
#include <algorithm>
#include <memory>
#include <utility>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/program.h"
#include "lite/operators/conditional_block_op.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#include "lite/core/profile/precision_profiler.h"
#include "lite/core/profile/profiler.h"
#endif
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class CondExecutor {
typedef std::shared_ptr<OpLite> OpPtr;
public:
CondExecutor(cpp::BlockDesc *block, Scope *scope, Place place)
: scope_(scope), place_(place) {
int32_t op_size = block->OpsSize();
for (int32_t i = 0; i < op_size; ++i) {
auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
auto op_type = op_desc.Type();
auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
op_handler->Attach(op_desc, scope);
auto hostplace = place_;
hostplace.target = TARGET(kHost);
auto kernels = op_handler->CreateKernels({place_, hostplace});
CHECK_GT(kernels.size(), 0) << "cannot create kernel";
op_handler->AttachKernel(kernels[0].get());
op_handler->SetKernel(kernels);
ops_of_block_.push_back(op_handler);
}
}
void Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
lite::profile::Profiler profiler;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
for (auto &op_handler : ops_of_block_) {
op_handler->CheckShape();
op_handler->InferShape();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
std::unique_ptr<KernelBase> kernel(op_handler->GetKernel());
Instruction inst(op_handler, std::move(kernel));
inst.set_profiler(&profiler);
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
op_handler->Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LITE_PRECISION_PROFILE(inst)
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
}
private:
Scope *scope_;
Place place_;
std::vector<OpPtr> ops_of_block_;
};
namespace host {
class ConditionalBlockCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::ConditionalBlockParam;
void PrepareForRun() override;
void Run() override;
virtual ~ConditionalBlockCompute() = default;
private:
std::shared_ptr<CondExecutor> executor_;
std::unique_ptr<RuntimeProgram> program_;
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/print_compute.h"
#include <mutex> // NOLINT
#include <string>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
const char kForward[] = "FORWARD";
const char kBackward[] = "BACKWARD";
const char kBoth[] = "BOTH";
class TensorFormatter {
public:
TensorFormatter() {}
std::string Format(const Tensor& print_tensor,
const std::string& tensor_name = "",
const std::string& message = "") {
std::stringstream log_stream;
if (!tensor_name.empty()) {
log_stream << "Variable: " << tensor_name << std::endl;
}
if (!message.empty()) {
log_stream << " - message: " << message << std::endl;
}
if (print_tensor_lod_) {
log_stream << " - lod: {";
const LoD& lod = print_tensor.lod();
for (auto level : lod) {
log_stream << "{";
bool is_first = true;
for (auto i : level) {
if (is_first) {
log_stream << i;
is_first = false;
} else {
log_stream << ", " << i;
}
}
log_stream << "}";
}
log_stream << "}" << std::endl;
}
log_stream << " - place: " << TargetToStr(print_tensor.target())
<< std::endl; // TODO(hong19860320) always kHost
if (print_tensor_shape_) {
log_stream << " - shape: " << print_tensor.dims().repr() << std::endl;
}
if (print_tensor_layout_) {
log_stream << " - layout: "
<< DataLayoutToStr(
DATALAYOUT(kNCHW)) // TODO(hong19860320) Query the data
// layout from target tensor
<< std::endl;
}
auto dtype = print_tensor.precision();
if (print_tensor_type_) {
log_stream << " - dtype: " << PrecisionToStr(dtype) << std::endl;
}
if (dtype == PRECISION(kBool)) {
FormatData<bool>(print_tensor, log_stream);
} else if (dtype == PRECISION(kInt8)) {
FormatData<int8_t>(print_tensor, log_stream);
} else if (dtype == PRECISION(kInt16)) {
FormatData<int16_t>(print_tensor, log_stream);
} else if (dtype == PRECISION(kInt32)) {
FormatData<int32_t>(print_tensor, log_stream);
} else if (dtype == PRECISION(kInt64)) {
FormatData<int64_t>(print_tensor, log_stream);
} else if (dtype == PRECISION(kFloat)) {
FormatData<float>(print_tensor, log_stream);
} else {
log_stream << "\tdata: unprintable type: " << PrecisionToStr(dtype)
<< std::endl;
}
return log_stream.str();
}
void Print(const Tensor& print_tensor,
const std::string& tensor_name = "",
const std::string& message = "") {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
std::cout << Format(print_tensor, tensor_name, message);
}
void SetPrintTensorType(bool print_tensor_type) {
print_tensor_type_ = print_tensor_type;
}
void SetPrintTensorShape(bool print_tensor_shape) {
print_tensor_shape_ = print_tensor_shape;
}
void SetPrintTensorLod(bool print_tensor_lod) {
print_tensor_lod_ = print_tensor_lod;
}
void SetPrintTensorLayout(bool print_tensor_layout) {
print_tensor_layout_ = print_tensor_layout;
}
void SetSummarize(int64_t summarize) { summarize_ = summarize; }
private:
template <typename T>
void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) {
int64_t print_size = summarize_ == -1
? print_tensor.numel()
: std::min(summarize_, print_tensor.numel());
const T* data = print_tensor.data<T>(); // Always kHost, so unnecessary to
// copy the data from device
log_stream << " - data: [";
if (print_size > 0) {
log_stream << data[0];
for (int64_t i = 1; i < print_size; ++i) {
log_stream << " " << data[i];
}
}
log_stream << "]" << std::endl;
}
int64_t summarize_ = -1;
bool print_tensor_type_ = true;
bool print_tensor_shape_ = true;
bool print_tensor_lod_ = true;
bool print_tensor_layout_ = true;
};
void PrintCompute::Run() {
auto& param = Param<param_t>();
param.out->CopyDataFrom(*param.in);
if ((param.is_forward && param.print_phase == kBackward) ||
(!param.is_forward && param.print_phase == kForward)) {
return;
}
int first_n = param.first_n;
if (first_n > 0 && ++times_ > first_n) return;
TensorFormatter formatter;
const std::string& name = param.print_tensor_name ? param.name : "";
formatter.SetPrintTensorType(param.print_tensor_type);
formatter.SetPrintTensorShape(param.print_tensor_shape);
formatter.SetPrintTensorLod(param.print_tensor_lod);
formatter.SetPrintTensorLayout(param.print_tensor_layout);
formatter.SetSummarize(static_cast<int64_t>(param.summarize));
formatter.Print(*param.in, name, param.message);
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
print, kHost, kAny, kAny, paddle::lite::kernels::host::PrintCompute, def)
.BindInput("In",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
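For reference, the TensorFormatter defined above can also be driven directly from debugging code. The sketch below is illustrative only (it is not part of this patch) and assumes a host tensor `t` that has already been populated; it uses nothing beyond the methods shown in this file.
// Illustrative only: drive the formatter above by hand while debugging.
// `t` is assumed to be a populated paddle::lite::Tensor on kHost.
paddle::lite::kernels::host::TensorFormatter formatter;
formatter.SetSummarize(10);          // print at most the first 10 elements
formatter.SetPrintTensorLod(false);  // skip the LoD section of the dump
formatter.Print(t, "fc_out", "after fc");  // thread-safe dump to stdout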
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class PrintCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::PrintParam;
void Run() override;
virtual ~PrintCompute() = default;
private:
mutable int times_{0};
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -12,44 +12,44 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/while_compute.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/kernels/host/while_compute.h"
#include <unordered_map>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void WhileCompute::PrepareForRun() {
auto &param = Param<operators::WhileParam>();
auto cur_scope = param.scope;
executor_ =
std::make_shared<StepExecutor>(param.sub_block, cur_scope, place());
auto &param = this->Param<param_t>();
program_.reset(new RuntimeProgram(
param.program_desc, param.exec_scope, param.block_idx));
}
void WhileCompute::Run() {
auto &param = Param<operators::WhileParam>();
auto &param = this->Param<param_t>();
while (param.cond->data<bool>()[0]) {
executor_->Run();
program_->Run();
}
}
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
while, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::WhileCompute, def)
.BindInput("X", {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))})
while, kHost, kAny, kAny, paddle::lite::kernels::host::WhileCompute, def)
.BindInput("X",
{LiteType::GetTensorListTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Condition",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))})
{LiteType::GetTensorListTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("StepScopes",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
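The while kernel above shows the pattern this patch applies to the control flow ops: build a RuntimeProgram for the sub-block once in PrepareForRun() and execute it on demand. The following is a hypothetical sketch of the same pattern applied to conditional_block; it is not the actual kernel in this patch, ignores details such as is_scalar_condition, and assumes the same headers and namespaces as the while kernel above.
// Hypothetical sketch only: a conditional_block kernel written in the same style.
class ConditionalBlockComputeSketch
    : public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
 public:
  using param_t = operators::ConditionalBlockParam;
  void PrepareForRun() override {
    auto &param = this->Param<param_t>();
    // Build the sub-block program once and reuse it across Run() calls
    program_.reset(new RuntimeProgram(
        param.program_desc, param.exec_scope, param.block_idx));
  }
  void Run() override {
    auto &param = this->Param<param_t>();
    // Execute the sub-block only when the scalar condition evaluates to true
    if (param.cond->data<bool>()[0]) {
      program_->Run();
    }
  }
 private:
  std::unique_ptr<RuntimeProgram> program_;
};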
......@@ -15,56 +15,19 @@
#pragma once
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/operators/while_op.h"
#include "lite/core/program.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
class StepExecutor {
typedef std::shared_ptr<OpLite> OpPtr;
public:
StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place)
: scope_(scope), place_(place) {
int32_t op_size = block->OpsSize();
for (int32_t i = 0; i < op_size; ++i) {
auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
auto op_type = op_desc.Type();
auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
// VLOG(4) << "while: creating Op [" << op_type << "]";
op_handler->Attach(op_desc, scope);
auto hostplace = place_;
hostplace.target = TARGET(kHost);
auto kernels = op_handler->CreateKernels({place_, hostplace});
CHECK_GT(kernels.size(), 0) << "cannot create kernel";
op_handler->AttachKernel(kernels[0].get());
op_handler->SetKernel(kernels);
ops_of_block_.push_back(op_handler);
}
}
void Run() {
for (auto &op_handler : ops_of_block_) {
// VLOG(4) << op_handler->op_info()->Repr();
op_handler->InferShape();
// VLOG(4) << "while: infered shape";
op_handler->Run();
}
}
private:
Scope *scope_;
Place place_;
std::vector<OpPtr> ops_of_block_;
};
class WhileCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
class WhileCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::WhileParam;
......@@ -74,10 +37,10 @@ class WhileCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~WhileCompute() = default;
private:
std::shared_ptr<StepExecutor> executor_;
std::unique_ptr<RuntimeProgram> program_;
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -36,13 +36,17 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
cpp::BlockDesc* block_desc,
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope,
::paddle::lite_api::PrecisionType type)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {
graph_.SetFPType(type);
}
......@@ -51,7 +55,7 @@ class SubgraphEngine : public subgraph::Engine {
int status = 0;
// Convert all of the input data vars and add them into the MLU IR graph
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
auto input_tensor = exec_scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph_.AddNode(input_name,
......@@ -71,7 +75,8 @@ class SubgraphEngine : public subgraph::Engine {
if (origin_program_.empty()) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = inst.op();
CHECK(op);
std::string op_type = op->op_info()->Type();
......@@ -96,7 +101,7 @@ class SubgraphEngine : public subgraph::Engine {
for (auto& output_name : output_names_) {
if (graph_.HasNode(output_name)) {
graph_.AddOutput(graph_.GetNode(output_name));
auto output_tensor = scope_->FindMutableTensor(output_name);
auto output_tensor = exec_scope_->FindMutableTensor(output_name);
void* p_data = static_cast<void*>(
output_tensor->mutable_data<typename ::paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(
......@@ -144,11 +149,11 @@ class SubgraphCompute
auto& param = this->template Param<param_t>();
// LOG(INFO) << "SUBGRAP Prepare RUN index " << param.sub_block_idx;
engine_.reset(new SubgraphEngine<Precision>(this->ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope,
this->precision()));
CHECK(engine_);
}
......
......@@ -25,11 +25,14 @@ namespace subgraph {
Engine::Engine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
lite::Scope *scope)
: ctx_(ctx), block_idx_(block_idx), block_desc_(block_desc), scope_(scope) {
const std::vector<std::string> &output_names)
: ctx_(ctx),
block_idx_(block_idx),
program_desc_(program_desc),
exec_scope_(exec_scope) {
input_names_ = input_names;
output_names_ = output_names;
// Sort the name of input and output tensors, it's convenient for us to get
......@@ -55,12 +58,12 @@ bool Engine::PrepareWorkspaceForOriginProgram() {
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
origin_itensors_[i] = exec_scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
}
origin_otensors_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
origin_otensors_[i] = exec_scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
}
return true;
......@@ -69,70 +72,20 @@ bool Engine::PrepareWorkspaceForOriginProgram() {
bool Engine::BuildOriginProgram() {
// TODO(hong19860320) The block_desc needs to be divided into subgraphs during
// execution time. But treat it as a single subgraph for now.
origin_program_.clear();
for (size_t op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) {
auto op_desc = block_desc_->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
std::string op_type = op_desc->Type();
// Create op and pick up the best kernel
auto op = LiteOpRegistry::Global().Create(op_desc->Type());
CHECK(op) << "no Op found for " << op_type;
op->Attach(*op_desc, scope_);
std::unique_ptr<KernelBase> picked_kernel;
if (op_desc->HasAttr(kKernelTypeAttr)) {
// Create op and pick up the best kernel according to the
// kKernelTypeAttr attribute
auto kernel_type = op_desc->GetAttr<std::string>(kKernelTypeAttr);
std::string alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type
<< " for " << op_type;
auto kernels = op->CreateKernels({place});
CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type;
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase> &it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
picked_kernel = std::move(*it);
} else {
// TODO(hong19860320) add kernel picking according to the type of input
// and output tensors
VLOG(3) << "The attr '" << kKernelTypeAttr
<< "' not found, pick the first kernel for " << op_type;
std::vector<std::unique_ptr<KernelBase>> kernels;
#if defined(LITE_WITH_ARM)
kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}});
#elif defined(LITE_WITH_X86)
kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}});
#endif
if (kernels.size() > 0) {
picked_kernel = std::move(kernels.front());
} else {
LOG(WARNING) << "No kernels found for " << op_type;
}
}
if (picked_kernel != nullptr) {
picked_kernel->SetContext(
ContextScheduler::Global().NewContext(picked_kernel->target()));
}
origin_program_.emplace_back(std::move(op), std::move(picked_kernel));
if (!origin_program_) {
origin_program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, block_idx_));
}
CHECK(!origin_program_.empty()) << "no instructions";
return true;
}
bool Engine::LaunchOriginProgram() {
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
if (!origin_program_.empty()) {
for (auto &inst : origin_program_) {
auto op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
inst.Run();
}
if (origin_program_) {
VLOG(3) << "Roll back to run the origin program.";
origin_program_->Run();
return true;
}
return false;
......
......@@ -30,10 +30,10 @@ class Engine {
public:
Engine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
lite::Scope *scope);
const std::vector<std::string> &output_names);
virtual ~Engine() = default;
virtual bool Run();
......@@ -54,15 +54,15 @@ class Engine {
KernelContext *ctx_{nullptr};
int block_idx_{-1};
cpp::BlockDesc *block_desc_{nullptr};
const std::shared_ptr<const cpp::ProgramDesc> program_desc_{nullptr};
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
Scope *scope_{nullptr};
Scope *exec_scope_{nullptr};
bool is_first_epoch_{true};
std::vector<std::vector<int64_t>> origin_idims_;
std::vector<Tensor *> origin_itensors_;
std::vector<Tensor *> origin_otensors_;
std::vector<Instruction> origin_program_;
std::unique_ptr<RuntimeProgram> origin_program_{nullptr};
};
} // namespace subgraph
......
......@@ -55,7 +55,8 @@ std::string DeviceProgram::GenerateModelName(
}
// Deserialize the generated model, the precisions and dimensions of the origin
// output tensors of the subgraph op into files
// output tensors of the subgraph op from the cached configuration file and HiAI
// om file
bool DeviceProgram::LoadFromCacheFile(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
......@@ -71,7 +72,7 @@ bool DeviceProgram::LoadFromCacheFile(
VLOG(3) << "[NPU] Load model from " << model_path;
std::vector<char> model_buffer;
if (!ReadFile(model_path, &model_buffer)) {
LOG(WARNING) << "[NPU] read from " << model_path << " failed!";
LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!";
return false;
}
bool model_comp = false;
......@@ -98,9 +99,9 @@ bool DeviceProgram::LoadFromCacheFile(
LOG(WARNING) << "[NPU] read from " << config_path << " failed!";
return false;
}
std::string config_str(config_buffer.begin(), config_buffer.end());
std::string str(config_buffer.begin(), config_buffer.end());
// Parse the precision and shapes of the output tensors
auto output_options = Split<std::string>(config_str, ";");
auto output_options = Split<std::string>(str, ";");
CHECK_EQ(output_options.size(), output_names.size());
origin_otypes_.resize(output_names.size());
origin_odims_.resize(output_names.size());
......@@ -114,7 +115,7 @@ bool DeviceProgram::LoadFromCacheFile(
}
bool DeviceProgram::BuildGraphAndCacheToFile(
const std::vector<Instruction>& origin_program,
RuntimeProgram* origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
......@@ -127,10 +128,13 @@ bool DeviceProgram::BuildGraphAndCacheToFile(
// Convert all of the ops and their input vars and weights to HiAI IR nodes,
// then add them into the HiAI IR graph
int status = 0;
CHECK(!origin_program.empty()) << "no instructions";
subgraph::npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program) {
CHECK(origin_program) << "[NPU] The origin program is not initialized!";
CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0)
<< "[NPU] No instructions found in the origin program!";
const auto& insts = origin_program->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -149,7 +153,8 @@ bool DeviceProgram::BuildGraphAndCacheToFile(
// Collect the input and output nodes of the HiAI IR graph
std::vector<ge::Operator> device_inodes;
for (size_t i = 0; i < input_names.size(); i++) {
CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data());
CHECK(graph.Has(input_names[i]));
CHECK(graph.Get(input_names[i])->is_data());
device_inodes.push_back(*graph.Get(input_names[i])->data());
}
std::vector<ge::Operator> device_onodes;
......@@ -173,6 +178,9 @@ bool DeviceProgram::BuildGraphAndCacheToFile(
LOG(WARNING) << "[NPU] Load model failed!";
return false;
}
// Do not check model compatibility because it assumes that the cached om model
// is always compatible with the current device
// Update the precision and dimensions of the origin output tensors
CHECK_EQ(origin_otensors.size(), output_names.size());
origin_otypes_.resize(output_names.size());
......@@ -247,7 +255,7 @@ bool DeviceProgram::ShareBufferWithOriginTensors(
device_idims_[i].GetHeight() * device_idims_[i].GetWidth());
VLOG(3) << "[NPU] Init the input tensors for the device program and share "
"their buffers with the origin input tensors";
// reinit device tensor will free shared buffer, so copy data to a tmp
// Reinit device tensor will free shared buffer, so copy data to a tmp
// tensor
Tensor tmp;
tmp.CopyDataFrom(*(*origin_itensors)[i]);
......@@ -337,8 +345,9 @@ bool SubgraphEngine::BuildDeviceProgram() {
if (!device_programs_.count(origin_idims_)) {
auto device_program = std::make_shared<DeviceProgram>();
// Obtain the model cache dir from the NPU Context of the subgraph op
auto model_cache_dir = ctx_->As<NPUContext>().SubgraphModelCacheDir();
VLOG(3) << "[NPU] Getting subgraph model_cache_dir is: " << model_cache_dir;
auto model_cache_dir =
ctx_->As<NPUContext>().SubgraphModelCacheDir(exec_scope_);
VLOG(3) << "[NPU] Getting subgraph_model_cache_dir: " << model_cache_dir;
// Check and load if the cached model and configuration file exists
if (model_cache_dir.empty() ||
!device_program->LoadFromCacheFile(
......@@ -346,11 +355,13 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Build the model online, including converting the paddle ops to the HiAI
// IR nodes, building the HiAI IR graph to the om model, then load it as a
// new HiAI model manager client for inference.
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
CHECK(!origin_program_.empty()) << "no instructions";
if (!device_program->BuildGraphAndCacheToFile(origin_program_,
CHECK(origin_program_) << "[NPU] The origin program is not initialized!";
CHECK_GT(origin_program_->instructions().size(), 0)
<< "[NPU] No instructions found in the origin program!";
if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(),
input_names_,
output_names_,
origin_idims_,
......@@ -391,11 +402,11 @@ bool SubgraphEngine::LaunchDeviceProgram() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
......@@ -41,7 +41,7 @@ class DeviceProgram {
const std::vector<std::vector<int64_t>>& origin_idims,
const std::string& model_cache_dir);
bool BuildGraphAndCacheToFile(
const std::vector<Instruction>& origin_program,
RuntimeProgram* origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
......@@ -71,12 +71,16 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
cpp::BlockDesc* block_desc,
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string>& output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
protected:
bool PrepareWorkspaceForDeviceProgram() override;
......
......@@ -28,26 +28,6 @@ namespace lite {
namespace kernels {
namespace rknpu {
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
// Obtain the origin input tensors, and create the origin output
// tensors(Don't try to access them before launch the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_itensors_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
device_itensors_[i].reset(new hiai::AiTensor);
CHECK(device_itensors_[i]);
}
device_otensors_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
device_otensors_[i].reset(new hiai::AiTensor);
CHECK(device_otensors_[i]);
}
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
int status = 0;
......@@ -55,10 +35,11 @@ bool SubgraphEngine::BuildDeviceProgram() {
// RKNPU IR graph
subgraph::rknpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -76,92 +57,26 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
// Collect the valid input and output nodes in the RKNPU IR graph and update
// the input and output names
device_inames_.clear();
device_onames_.clear();
for (auto& input_name : input_names_) {
LOG(INFO) << "[RKNPU] Input node " << input_name;
if (graph.Has(input_name)) {
LOG(INFO) << input_name << " Precision "
<< PrecisionToStr(graph.Get(input_name)->precision());
device_itensors_.push_back(graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[RKNPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
for (auto& output_name : output_names_) {
LOG(INFO) << "[RKNPU] Output node " << output_name;
if (graph.Has(output_name)) {
auto tensor = scope_->FindMutableTensor(output_name);
LOG(INFO) << output_name << " Precision "
<< PrecisionToStr(tensor->precision());
device_otensors_.push_back(graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[RKNPU] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[RKNPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[RKNPU] No output nodes found for building NPU model";
device_program_ = lite::rknpu::Device::Global().Build(
model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
if (device_program_ == nullptr) {
LOG(WARNING) << "[RKNPU] Build model failed!";
return false;
}
// input
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
device_itensors_.clear();
device_otensors_.clear();
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
}
// output
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
auto output_dims = origin_otensors_[i]->dims();
}
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
device_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
CHECK(graph.Has(input_names_[i])) << "[RKNPU] Failed to find input node "
<< input_names_[i];
auto node = graph.Get(input_names_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i]
LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << input_names_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
device_itensors_.push_back(node->data());
}
for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
for (size_t i = 0; i < output_names_.size(); i++) {
CHECK(graph.Has(output_names_[i])) << "[RKNPU] Failed to find output node "
<< output_names_[i];
auto node = graph.Get(output_names_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i]
LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << output_names_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
// Prepare the device output tensors
......@@ -182,11 +97,19 @@ bool SubgraphEngine::BuildDeviceProgram() {
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[RKNPU] " << device_onames_[i]
LOG(FATAL) << "[RKNPU] " << output_names_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
device_otensors_.push_back(node->data());
}
// Create the RKNPU model and set the input and output nodes
device_program_ = lite::rknpu::Device::Global().Build(
model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
if (device_program_ == nullptr) {
LOG(WARNING) << "[RKNPU] Build model failed!";
return false;
}
return true;
}
......@@ -196,8 +119,8 @@ bool SubgraphEngine::LaunchDeviceProgram() {
std::vector<rk::nn::InputInfo> inputs;
std::vector<rk::nn::OutputInfo> outputs;
inputs.resize(device_itensors_.size());
for (size_t i = 0; i < device_itensors_.size(); i++) {
inputs.resize(origin_itensors_.size());
for (size_t i = 0; i < origin_itensors_.size(); i++) {
inputs[i].index = i;
inputs[i].buf = const_cast<void*>(origin_itensors_[i]->raw_data());
inputs[i].size = origin_itensors_[i]->memory_size();
......@@ -207,8 +130,8 @@ bool SubgraphEngine::LaunchDeviceProgram() {
inputs[i].layout = rk::nn::DataLayoutType::NCHW;
}
outputs.resize(device_otensors_.size());
for (size_t i = 0; i < device_otensors_.size(); i++) {
outputs.resize(origin_otensors_.size());
for (size_t i = 0; i < origin_otensors_.size(); i++) {
outputs[i].index = i;
outputs[i].buf = const_cast<void*>(origin_otensors_[i]->raw_data());
outputs[i].size = origin_otensors_[i]->memory_size();
......@@ -225,11 +148,11 @@ void SubgraphCompute::PrepareForRun() {
LOG(INFO) << "[RKNPU]:PrepareForRun";
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
......@@ -34,15 +34,18 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
protected:
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
......
......@@ -53,10 +53,11 @@ bool SubgraphEngine::BuildDeviceProgram() {
// IR graph
subgraph::xpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -123,7 +124,7 @@ bool SubgraphEngine::BuildDeviceProgram() {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
origin_itensors_[i] = exec_scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i]
......@@ -147,7 +148,7 @@ bool SubgraphEngine::BuildDeviceProgram() {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
origin_otensors_[i] = exec_scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i]
......@@ -220,11 +221,11 @@ bool SubgraphEngine::LaunchDeviceProgram() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
......@@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
protected:
bool PrepareWorkspaceForDeviceProgram() override;
......
......@@ -24,6 +24,12 @@ VarDesc* BlockDesc::GetVar<VarDesc>(int32_t idx) {
return &vars_[idx];
}
template <>
VarDesc const* BlockDesc::GetVar<VarDesc>(int32_t idx) const {
CHECK_LT(idx, VarsSize()) << "idx >= vars.size()";
return &vars_[idx];
}
template <>
VarDesc* BlockDesc::AddVar<VarDesc>() {
vars_.emplace_back();
......@@ -36,6 +42,12 @@ OpDesc* BlockDesc::GetOp<OpDesc>(int32_t idx) {
return &ops_[idx];
}
template <>
OpDesc const* BlockDesc::GetOp<OpDesc>(int32_t idx) const {
CHECK_LT(idx, OpsSize()) << "idx >= ops.size()";
return &ops_[idx];
}
template <>
OpDesc* BlockDesc::AddOp<OpDesc>() {
ops_.emplace_back();
......
......@@ -45,6 +45,9 @@ class BlockDesc : public BlockDescAPI {
template <typename T>
T* GetVar(int32_t idx);
template <typename T>
T const* GetVar(int32_t idx) const;
std::vector<VarDesc>& GetVars() { return vars_; }
template <typename T>
......@@ -57,6 +60,9 @@ class BlockDesc : public BlockDescAPI {
template <typename T>
T* GetOp(int32_t idx);
template <typename T>
T const* GetOp(int32_t idx) const;
template <typename T>
T* AddOp();
......
......@@ -24,6 +24,12 @@ BlockDesc* ProgramDesc::GetBlock<BlockDesc>(int32_t idx) {
return &blocks_[idx];
}
template <>
BlockDesc const* ProgramDesc::GetBlock<BlockDesc>(int32_t idx) const {
CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()";
return &blocks_[idx];
}
template <>
BlockDesc* ProgramDesc::AddBlock<BlockDesc>() {
blocks_.emplace_back();
......
......@@ -36,6 +36,9 @@ class ProgramDesc : public ProgramDescAPI {
template <typename T>
T* GetBlock(int32_t idx);
template <typename T>
T const* GetBlock(int32_t idx) const;
std::vector<BlockDesc>& GetBlocks() { return blocks_; }
template <typename T>
......
......@@ -22,6 +22,11 @@
namespace paddle {
namespace lite {
// The index of the first block in a program, also called the root block.
constexpr int kRootBlockIdx = 0;
// The parent index of the root block; this block does not exist.
constexpr int kNoneBlockIdx = -1;
/*
* Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc
* classes should implement this.
......
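As a usage note (not part of this patch), the new constants combine with the const accessors added to cpp::ProgramDesc and cpp::BlockDesc above. A hypothetical helper that walks the root block of a shared, read-only program descriptor might look like this; the function name is invented for illustration.
#include <memory>
#include "lite/model_parser/cpp/program_desc.h"
// Hypothetical helper: count the ops of the root block of a const program
// descriptor, relying on kRootBlockIdx and the const GetBlock/GetOp overloads.
size_t CountRootBlockOps(
    const std::shared_ptr<const paddle::lite::cpp::ProgramDesc> &program_desc) {
  const auto *block = program_desc->GetBlock<paddle::lite::cpp::BlockDesc>(
      paddle::lite::kRootBlockIdx);
  return block->OpsSize();
}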
......@@ -294,9 +294,9 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_->type().tensor_array().tensor();
default:
LOG(FATAL)
<< "Getting 'tensor_desc' is not supported by the type of var %s."
<< this->Name();
LOG(WARNING) << "Getting 'tensor_desc' is not supported by the type("
<< static_cast<int>(desc_->type().type()) << ") of var "
<< this->Name();
}
return framework::proto::VarDesc().type().lod_tensor().tensor();
}
......@@ -312,10 +312,9 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
}
return res;
default:
LOG(FATAL)
<< "Getting 'tensor_descs' is not supported by the type of var "
"%s."
<< this->Name();
LOG(WARNING) << "Getting 'tensor_descs' is not supported by the type("
<< static_cast<int>(desc_->type().type()) << ") of var "
<< this->Name();
}
return std::vector<proto::VarType::TensorDesc>();
}
......
......@@ -109,6 +109,7 @@ add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposal
add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS})
add_operator(print_op extra SRCS print_op.cc DEPS ${op_DEPS})
# for OCR specific
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
......
......@@ -21,15 +21,15 @@ namespace lite {
namespace operators {
bool AssignOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Out);
CHECK_OR_FALSE(param_.X || param_.X_array);
CHECK_OR_FALSE(param_.Out || param_.Out_array);
return true;
}
bool AssignOpLite::InferShapeImpl() const {
if (param_.X != nullptr) {
if (param_.X) {
param_.Out->Resize(param_.X->dims());
} else if (param_.X_array != nullptr) {
} else if (param_.X_array) {
param_.Out_array->resize(param_.Out_array->size());
} else {
LOG(FATAL) << "x or x_array must be set.";
......
......@@ -20,35 +20,37 @@ namespace paddle {
namespace lite {
namespace operators {
bool ConditionalBlockOpLite::CheckShape() const {
bool ConditionalBlockOp::CheckShape() const {
CHECK_OR_FALSE(param_.cond);
CHECK_OR_FALSE(param_.sub_block);
CHECK_OR_FALSE(param_.scope);
CHECK_OR_FALSE(param_.program_desc);
CHECK_OR_FALSE(param_.exec_scope);
return true;
}
bool ConditionalBlockOpLite::InferShapeImpl() const { return true; }
bool ConditionalBlockOp::InferShapeImpl() const { return true; }
bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc,
lite::Scope *scope) {
bool ConditionalBlockOp::AttachImpl(const cpp::OpDesc& op_desc, Scope* scope) {
auto condition = op_desc.Input("Cond").front();
param_.cond = scope->FindVar(condition)->GetMutable<lite::Tensor>();
auto inputs = op_desc.Input("Input");
for (auto var : inputs) {
param_.x.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
for (const auto& input : inputs) {
auto* var = scope->FindVar(input);
CHECK(var);
param_.inputs.push_back(var->GetMutable<lite::Tensor>());
}
auto outs = op_desc.Output("Out");
for (auto var : outs) {
param_.outs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
for (const auto& out : outs) {
auto* var = scope->FindVar(out);
CHECK(var);
param_.outs.push_back(var->GetMutable<lite::Tensor>());
}
param_.is_scalar_condition = op_desc.GetAttr<bool>("is_scalar_condition");
// obtain sub_block in core program.cc
param_.sub_block = sub_block_;
param_.scope = scope;
CHECK(param_.program_desc);
param_.block_idx = op_desc.GetAttr<int32_t>("sub_block");
CHECK_GE(param_.block_idx, 0);
param_.exec_scope = scope;
CHECK(param_.exec_scope);
return true;
}
......@@ -57,4 +59,4 @@ bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc,
} // namespace paddle
REGISTER_LITE_OP(conditional_block,
paddle::lite::operators::ConditionalBlockOpLite);
paddle::lite::operators::ConditionalBlockOp);
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
......@@ -23,27 +24,30 @@ namespace paddle {
namespace lite {
namespace operators {
class ConditionalBlockOpLite : public OpLite {
class ConditionalBlockOp : public OpLite {
public:
ConditionalBlockOpLite() {}
explicit ConditionalBlockOpLite(const std::string &op_type)
: OpLite(op_type) {}
ConditionalBlockOp() {}
explicit ConditionalBlockOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "conditional_block"; }
void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; }
void SetProgramDesc(std::shared_ptr<const cpp::ProgramDesc> program_desc) {
param_.program_desc = program_desc;
}
std::shared_ptr<const cpp::ProgramDesc> GetProgramDesc() {
return param_.program_desc;
}
private:
mutable ConditionalBlockParam param_;
cpp::BlockDesc *sub_block_;
};
} // namespace operators
......
......@@ -21,7 +21,7 @@
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/core/types.h"
#include "lite/model_parser/cpp/block_desc.h"
#include "lite/model_parser/cpp/program_desc.h"
#include "lite/model_parser/desc_apis.h"
#include "lite/utils/all.h"
#include "lite/utils/variant.h"
......@@ -91,9 +91,9 @@ struct SubgraphParam : ParamBase {
std::vector<std::string> output_names{};
std::vector<std::string> input_data_names{};
std::vector<std::string> output_data_names{};
int sub_block_idx{-1};
cpp::BlockDesc* sub_block_desc{nullptr};
Scope* scope{nullptr};
int block_idx{-1};
std::shared_ptr<const cpp::ProgramDesc> program_desc{nullptr};
Scope* exec_scope{nullptr};
};
/// -------------------------- NN operators ------------------------------------
......@@ -939,11 +939,10 @@ struct CompareParam : ParamBase {
};
struct WhileParam : ParamBase {
Scope* scope{};
Tensor* cond{};
cpp::BlockDesc* sub_block{};
std::vector<Tensor*> x{};
std::vector<Tensor*> outs{};
int block_idx{-1};
std::shared_ptr<const cpp::ProgramDesc> program_desc{nullptr};
Scope* exec_scope{nullptr};
};
struct TopkParam : ParamBase {
......@@ -1396,10 +1395,11 @@ struct MergeLodTensorParam : ParamBase {
struct ConditionalBlockParam : ParamBase {
const lite::Tensor* cond{};
std::vector<lite::Tensor*> x{};
std::vector<lite::Tensor*> inputs{};
std::vector<lite::Tensor*> outs{};
cpp::BlockDesc* sub_block{};
Scope* scope{};
int block_idx{-1};
std::shared_ptr<const cpp::ProgramDesc> program_desc{nullptr};
Scope* exec_scope{nullptr};
bool is_scalar_condition{};
};
......@@ -1520,6 +1520,23 @@ struct PixelShuffleParam : ParamBase {
lite::Tensor* output{nullptr};
int upscale_factor{1};
};
struct PrintParam : ParamBase {
const lite::Tensor* in{};
lite::Tensor* out{};
std::string name;
int first_n{-1};
std::string message;
int summarize{20};
bool print_tensor_name{true};
bool print_tensor_type{true};
bool print_tensor_shape{true};
bool print_tensor_lod{true};
bool print_tensor_layout{true};
std::string print_phase;
bool is_forward{true};
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/print_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool PrintOp::CheckShape() const {
CHECK_OR_FALSE(param_.in);
CHECK_OR_FALSE(param_.out);
return true;
}
bool PrintOp::InferShapeImpl() const {
param_.out->set_lod(param_.in->lod());
param_.out->Resize(param_.in->dims());
return true;
}
bool PrintOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
AttachParam(&param_);
param_.name = op_desc.Input("In").front();
param_.in = scope->FindTensor(param_.name);
param_.out = scope->FindMutableTensor(op_desc.Output("Out").front());
param_.first_n = op_desc.GetAttr<int32_t>("first_n");
param_.message = op_desc.GetAttr<std::string>("message");
param_.summarize = op_desc.GetAttr<int32_t>("summarize");
param_.print_tensor_name = op_desc.GetAttr<bool>("print_tensor_name");
param_.print_tensor_type = op_desc.GetAttr<bool>("print_tensor_type");
param_.print_tensor_shape = op_desc.GetAttr<bool>("print_tensor_shape");
param_.print_tensor_lod = op_desc.GetAttr<bool>("print_tensor_lod");
param_.print_tensor_layout = op_desc.GetAttr<bool>("print_tensor_layout");
param_.print_phase = op_desc.GetAttr<std::string>("print_phase");
param_.is_forward = op_desc.GetAttr<bool>("is_forward");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(print, paddle::lite::operators::PrintOp);
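As a usage note (not part of this patch), the attribute names read in AttachImpl() above suggest how a unit test might construct the op descriptor. The snippet below is a hypothetical sketch; the input/output variable names and the print_phase value are assumptions made for illustration.
// Hypothetical sketch: build an op descriptor matching PrintOp::AttachImpl.
cpp::OpDesc op_desc;
op_desc.SetType("print");
op_desc.SetInput("In", {"x"});
op_desc.SetOutput("Out", {"x_printed"});
op_desc.SetAttr<int32_t>("first_n", -1);    // print on every invocation
op_desc.SetAttr<std::string>("message", "x = ");
op_desc.SetAttr<int32_t>("summarize", 20);  // dump the first 20 elements
op_desc.SetAttr<bool>("print_tensor_name", true);
op_desc.SetAttr<bool>("print_tensor_type", true);
op_desc.SetAttr<bool>("print_tensor_shape", true);
op_desc.SetAttr<bool>("print_tensor_lod", true);
op_desc.SetAttr<bool>("print_tensor_layout", true);
op_desc.SetAttr<std::string>("print_phase", "FORWARD");  // value assumed; see the kForward/kBackward check in the kernel
op_desc.SetAttr<bool>("is_forward", true);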
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class PrintOp : public OpLite {
public:
PrintOp() {}
explicit PrintOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "print"; }
private:
mutable PrintParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -39,10 +39,11 @@ bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
op_desc.GetAttr<std::vector<std::string>>("input_data_names");
param_.output_data_names =
op_desc.GetAttr<std::vector<std::string>>("output_data_names");
CHECK(param_.sub_block_desc);
param_.sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
param_.scope = scope;
CHECK(param_.scope);
CHECK(param_.program_desc);
param_.block_idx = op_desc.GetAttr<int32_t>("sub_block");
CHECK_GE(param_.block_idx, 0);
param_.exec_scope = scope;
CHECK(param_.exec_scope);
return true;
}
......
......@@ -13,14 +13,11 @@
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
......@@ -37,14 +34,18 @@ class SubgraphOp : public OpLite {
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
bool AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "subgraph"; }
void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; }
cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; }
void SetProgramDesc(std::shared_ptr<const cpp::ProgramDesc> program_desc) {
param_.program_desc = program_desc;
}
std::shared_ptr<const cpp::ProgramDesc> GetProgramDesc() {
return param_.program_desc;
}
private:
mutable SubgraphParam param_;
......
......@@ -20,31 +20,23 @@ namespace paddle {
namespace lite {
namespace operators {
bool WhileOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.sub_block);
CHECK_OR_FALSE(param_.scope);
bool WhileOp::CheckShape() const {
CHECK_OR_FALSE(param_.cond);
CHECK_OR_FALSE(param_.program_desc);
CHECK_OR_FALSE(param_.exec_scope);
return true;
}
bool WhileOpLite::InferShapeImpl() const { return true; }
bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
auto inputs = op_desc.Input("X");
auto outs = op_desc.Output("Out");
for (auto var : inputs) {
// param_.x.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
for (auto var : outs) {
// param_.outs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
param_.sub_block = sub_block_;
bool WhileOp::InferShapeImpl() const { return true; }
bool WhileOp::AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) {
auto condition = op_desc.Input("Condition");
param_.cond = scope->FindVar(condition[0])->GetMutable<lite::Tensor>();
param_.scope = scope;
CHECK(param_.program_desc);
param_.block_idx = op_desc.GetAttr<int32_t>("sub_block");
CHECK_GE(param_.block_idx, 0);
param_.exec_scope = scope;
CHECK(param_.exec_scope);
return true;
}
......@@ -52,4 +44,4 @@ bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(while, paddle::lite::operators::WhileOpLite);
REGISTER_LITE_OP(while, paddle::lite::operators::WhileOp);
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
......@@ -23,24 +24,30 @@ namespace paddle {
namespace lite {
namespace operators {
class WhileOpLite : public OpLite {
class WhileOp : public OpLite {
public:
WhileOpLite() {}
explicit WhileOpLite(const std::string &op_type) : OpLite(op_type) {}
WhileOp() {}
explicit WhileOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "while"; }
void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; }
void SetProgramDesc(std::shared_ptr<const cpp::ProgramDesc> program_desc) {
param_.program_desc = program_desc;
}
std::shared_ptr<const cpp::ProgramDesc> GetProgramDesc() {
return param_.program_desc;
}
private:
mutable WhileParam param_;
cpp::BlockDesc *sub_block_;
};
} // namespace operators
......
if(LITE_WITH_ARM)
lite_cc_test(test_transformer_with_mask_fp32_arm SRCS test_transformer_with_mask_fp32_arm.cc
DEPS ${lite_model_test_DEPS} paddle_api_full
ARM_DEPS ${arm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/transformer_with_mask_fp32 SERIAL)
if(WITH_TESTING)
add_dependencies(test_transformer_with_mask_fp32_arm extern_lite_download_transformer_with_mask_fp32_tar_gz)
endif()
endif()
if(LITE_WITH_XPU)
lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
......
(5 collapsed diffs not shown.)
......@@ -387,7 +387,7 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl")
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl" "test_transformer_with_mask_fp32_arm")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
......@@ -1152,6 +1152,7 @@ function main {
build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
build_test_arm_subtask_model test_resnet50 resnet50
build_test_arm_subtask_model test_inceptionv4 inception_v4_simple
build_test_arm_subtask_model test_transformer_with_mask_fp32_arm transformer_with_mask_fp32
shift
;;
build_test_arm_subtask_armlinux)
......
(2 collapsed diffs not shown.)