diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc
index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..345b239c320f04eba8426483a23a352e77a71036 100644
--- a/lite/backends/npu/device.cc
+++ b/lite/backends/npu/device.cc
@@ -19,8 +19,8 @@ namespace paddle {
 namespace lite {
 namespace npu {
 
-std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
-    std::string& model_name,                 // NOLINT
+std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
+    const std::string model_name,            // NOLINT
     std::vector<ge::Operator>& input_nodes,  // NOLINT
     std::vector<ge::Operator>& output_nodes  // NOLINT
     ) {
@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
     ir_build.ReleaseModelBuff(om_model_buf);
     return nullptr;
   }
+
   // Create a HiAI model manager client to load the HiAI om model
-  std::unique_ptr<hiai::AiModelMngerClient> model_client(
+  std::shared_ptr<hiai::AiModelMngerClient> model_client(
       new hiai::AiModelMngerClient());
   if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
     LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
     ir_build.ReleaseModelBuff(om_model_buf);
     return nullptr;
   }
-  model_name = "model_" + std::to_string(model_count_++) + ".om";
   auto model_desc = std::make_shared<hiai::AiModelDescription>(
       model_name, freq_level(), framework_type(), model_type(), device_type());
   model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h
index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..6733a7f6dfa085d2c64274a81ba2a028ebe88f3f 100644
--- a/lite/backends/npu/device.h
+++ b/lite/backends/npu/device.h
@@ -40,8 +40,8 @@ class Device {
 
   // Build the HiAI IR graph to om model, return HiAI model manager client to
   // load om model and run inference.
-  std::unique_ptr<hiai::AiModelMngerClient> Build(
-      std::string& model_name,                 // NOLINT
+  std::shared_ptr<hiai::AiModelMngerClient> Build(
+      const std::string model_name,            // NOLINT
       std::vector<ge::Operator>& input_nodes,  // NOLINT
       std::vector<ge::Operator>& output_nodes  // NOLINT
       );  // NOLINT
@@ -51,7 +51,6 @@ class Device {
   int framework_type_{0};
   int model_type_{0};
   int device_type_{0};
-  int model_count_{0};
 };
 
 }  // namespace npu
diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc
index 994f346ced7c9f5567fdd2ab86e741a25363e881..c976a57c33ec6bde89705bf72c71f5c483d44c59 100644
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
@@ -407,7 +407,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
 
   cpp::OpDesc subgraph_op_desc;
   subgraph_op_desc.SetType("subgraph");
-  // Create a new sub block desc for storing all of Ops an Vars of the target
+  // Create a new sub block desc for storing all of Ops and Vars of the target
   // subgraph and sub_block_idx is set as a attribute of subgraph op,
   // sub_block_idx < 0 means it's a new subgraph op
   int sub_block_idx = -(subgraph_idx + 1);
diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc
index 7cf65a23b8c5646c8ff6c77917dde53b7f036b9c..7117e1b3399fe823194f7f1a4d4c239099580955 100644
--- a/lite/core/mir/subgraph/subgraph_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
@@ -17,6 +17,7 @@
 #include "lite/api/paddle_api.h"
 #include "lite/api/test_helper.h"
 #include "lite/utils/cp_logging.h"
+#include "lite/utils/string.h"
 
 DEFINE_string(model_file, "", "model file path of combined protobuf model");
 DEFINE_string(params_file, "", "params file path of combined protobuf model");
@@ -31,43 +32,17 @@ namespace lite {
 // The helper functions for loading and running model from command line and
 // verifying output data
 std::vector<std::string> TypeParsing(std::string text) {
-  std::vector<std::string> types;
-  while (!text.empty()) {
-    size_t index = text.find_first_of(":");
-    std::string type = text.substr(0, index);
-    VLOG(3) << type;
-    types.push_back(type);
-    if (index == std::string::npos) {
-      break;
-    } else {
-      text = text.substr(index + 1);
-    }
-  }
-  return types;
+  return Split(text, ":");
 }
 
 std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
   std::vector<std::vector<int64_t>> shapes;
-  while (!text.empty()) {
-    size_t index = text.find_first_of(":");
-    std::string slice = text.substr(0, index);
-    std::vector<int64_t> shape;
-    while (!slice.empty()) {
-      size_t index = slice.find_first_of(",");
-      int d = atoi(slice.substr(0, index).c_str());
-      VLOG(3) << d;
-      shape.push_back(d);
-      if (index == std::string::npos) {
-        break;
-      } else {
-        slice = slice.substr(index + 1);
-      }
-    }
-    shapes.push_back(shape);
-    if (index == std::string::npos) {
-      break;
-    } else {
-      text = text.substr(index + 1);
+  std::vector<std::string> shape_strings = Split(text, ":");
+  shapes.resize(shape_strings.size());
+  for (int i = 0; i < shape_strings.size(); i++) {
+    std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
+    for (auto shape_num : shape_nums) {
+      shapes[i].push_back(atoi(shape_num.c_str()));
     }
   }
   return shapes;
diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc
index 770ea345b633034972cb71cb4f1236ecefff36d7..d7b14a9319951eb827cbc9d346ee8e59e9571aee 100644
--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
       << "[NPU] No input nodes found for building NPU model";
   CHECK(!device_onames_.empty())
       << "[NPU] No output nodes found for building NPU model";
+
   // Build the HiAI IR graph to HiAI om model as the device program
-  device_program_ = lite::npu::Device::Global().Build(
+  if (device_program_map_.count(inputs_shape_) > 0) {
+    return status;
+  }
+  auto device_client = lite::npu::Device::Global().Build(
       model_name_, device_inodes, device_onodes);
-  if (device_program_ == nullptr) {
+  if (device_client == nullptr) {
     LOG(WARNING) << "[NPU] Build model failed!";
     return subgraph::FAILED;
   }
+  auto device_program = std::make_shared<device_program_t>(device_client);
+  device_program_map_[inputs_shape_] = device_program;
 
   // Query and check the dimensions of valid input and output tensors
   std::vector<hiai::TensorDimension> device_idims, device_odims;
-  if (device_program_->GetModelIOTensorDim(
+  if (device_program->client->GetModelIOTensorDim(
           model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
     LOG(WARNING)
         << "[NPU] Get the dimensions of input and output tensors failed!";
     return subgraph::FAILED;
   }
+  device_program->device_idims = device_idims;
+  device_program->device_odims = device_odims;
+
   CHECK_EQ(device_idims.size(), device_inames_.size());
   CHECK_EQ(device_odims.size(), device_onames_.size());
   origin_idims_.resize(device_inames_.size());
@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   origin_odims_.resize(device_onames_.size());
   origin_otensors_.resize(device_onames_.size());
   device_otensors_.resize(device_onames_.size());
+
   for (int i = 0; i < device_inames_.size(); i++) {
     auto node = graph.Get(device_inames_[i]);
     auto precision = node->precision();
@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
     device_itensors_[i].reset(new hiai::AiTensor);
     device_itensors_[i]->Init(&(device_idims[i]));
   }
+  device_program->origin_idims = origin_idims_;
+
   for (int i = 0; i < device_onames_.size(); i++) {
     auto node = graph.Get(device_onames_[i]);
     auto precision = node->precision();
@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
             << PrecisionToStr(precision);
         break;
     }
+    device_program->origin_odims = origin_odims_;
+
     CHECK_EQ(origin_odims_[i].production(),
              device_odims[i].GetNumber() * device_odims[i].GetChannel() *
                  device_odims[i].GetHeight() * device_odims[i].GetWidth());
@@ -181,14 +195,25 @@
 int SubgraphEngine::LaunchDeviceProgram() {
   // Copy the data of origin input tensors to the buffer of input HiAI tensors
+  // init device_itensors_, device_otensors_, origin_otensors_
+  auto device_program = device_program_map_[inputs_shape_];
+
   for (size_t i = 0; i < device_itensors_.size(); i++) {
+    device_itensors_[i]->Init(&(device_program->device_idims[i]));
     std::memcpy(device_itensors_[i]->GetBuffer(),
                 origin_itensors_[i]->raw_data(),
                 origin_itensors_[i]->memory_size());
   }
+  for (size_t i = 0; i < device_otensors_.size(); i++) {
+    device_otensors_[i]->Init(&(device_program->device_odims[i]));
+  }
+  for (size_t i = 0; i < origin_otensors_.size(); i++) {
+    origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+  }
 
   // Run the HiAI model by name
   std::string key = "model_name";  // Note: key seems must be model_name
-  model_context_.AddPara(key, model_name_);
+  hiai::AiContext model_context;
+  model_context.AddPara(key, model_name_);
   auto GetCurrentUS = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -196,11 +221,11 @@
   };
   int istamp;
   auto start_time = GetCurrentUS();
-  CHECK_EQ(
-      device_program_->Process(
-          model_context_, device_itensors_, device_otensors_, 1000, istamp),
-      hiai::AI_SUCCESS);
+  CHECK_EQ(device_program->client->Process(
+               model_context, device_itensors_, device_otensors_, 1000, istamp),
+           hiai::AI_SUCCESS);
   VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
+
   // Copy the data of output HiAI tensor to the buffer of origin output tensors
   for (size_t i = 0; i < device_otensors_.size(); i++) {
     std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
@@ -210,6 +235,18 @@
   return 0;
 }
 
+bool SubgraphEngine::InputShapeChanged() {
+  std::vector<std::vector<int64_t>> new_shape;
+  for (auto origin_itensor : origin_itensors_) {
+    new_shape.push_back(origin_itensor->dims().Vectorize());
+  }
+  inputs_shape_ = new_shape;
+  if (device_program_map_.count(inputs_shape_) > 0) {
+    return false;
+  }
+  return true;
+}
+
 void SubgraphCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   engine_.reset(new SubgraphEngine(ctx_.get(),
diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h
index 29aeb01cdb50e2a9dd6d066a2f11106fd4cb20fb..801f61b0365c03d59c36e2a62ac3c2bb61f46607 100644
--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
       : subgraph::Engine(
             ctx, block_idx, block_desc, input_names, output_names, scope) {}
 
+  struct device_program_t {
+    explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
+        : client(_client) {}
+    std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
+    std::vector<DDim> origin_idims{};
+    std::vector<DDim> origin_odims{};
+    std::vector<hiai::TensorDimension> device_idims{};
+    std::vector<hiai::TensorDimension> device_odims{};
+  };
+
  protected:
   int BuildDeviceProgram() override;
   int LaunchDeviceProgram() override;
+  bool InputShapeChanged() override;
 
-  std::string model_name_;
-  hiai::AiContext model_context_;
-  std::vector<std::string> device_inames_;
-  std::vector<std::string> device_onames_;
-  std::vector<std::unique_ptr<hiai::AiTensor>> device_itensors_;
-  std::vector<std::unique_ptr<hiai::AiTensor>> device_otensors_;
-  std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
+  std::string model_name_{"model.om"};
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
+      device_program_map_{};
+  std::vector<std::string> device_inames_{};
+  std::vector<std::string> device_onames_{};
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
 };
 
 class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
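
Note on the lite/backends/npu changes: Build() now hands back a std::shared_ptr
rather than a std::unique_ptr, presumably because the client must be co-owned by
the per-shape cache entry (device_program_t::client) and by whatever code still
holds the returned handle; likewise model_count_ goes away since model_name_ is
now a fixed per-engine name ("model.om") and programs are distinguished by input
shape, not by a freshly numbered om file. A toy illustration of the ownership
difference, using a generic Client type that is not the HiAI client:

#include <cassert>
#include <memory>
#include <utility>

struct Client {};  // generic stand-in; not the HiAI client type

int main() {
  // unique_ptr: ownership must move, so a cache entry and a caller cannot
  // both keep the handle alive.
  std::unique_ptr<Client> sole(new Client());
  std::unique_ptr<Client> moved = std::move(sole);
  assert(sole == nullptr);  // the original handle is now empty

  // shared_ptr: a cache entry and a caller co-own one client.
  std::shared_ptr<Client> cached = std::make_shared<Client>();
  std::shared_ptr<Client> caller = cached;  // both handles stay valid
  assert(cached.use_count() == 2);
  return 0;
}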
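Note on TypeParsing/ShapeParsing: both helpers now delegate tokenizing to the
Split utility pulled in via lite/utils/string.h. A self-contained sketch of the
same parsing flow follows; the Split here is a stand-in written for illustration
(assumed behavior: split on a delimiter and drop the empty tail), not the actual
Paddle-Lite implementation:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the Split helper from lite/utils/string.h (assumed behavior).
std::vector<std::string> Split(const std::string& text, const std::string& sep) {
  std::vector<std::string> pieces;
  size_t start = 0;
  while (start < text.size()) {
    size_t end = text.find(sep, start);
    if (end == std::string::npos) end = text.size();
    pieces.push_back(text.substr(start, end - start));
    start = end + sep.size();
  }
  return pieces;
}

int main() {
  // Mirrors ShapeParsing: "1,3,224,224:1,1" -> {{1,3,224,224}, {1,1}}
  std::vector<std::vector<int64_t>> shapes;
  for (const auto& slice : Split("1,3,224,224:1,1", ":")) {
    std::vector<int64_t> shape;
    for (const auto& num : Split(slice, ",")) {
      shape.push_back(atoi(num.c_str()));
    }
    shapes.push_back(shape);
  }
  std::cout << shapes.size() << " shapes parsed" << std::endl;  // prints 2
  return 0;
}

Given --input_shapes=1,3,224,224:1,1 on the command line, this yields the same
two shapes that the old hand-rolled find_first_of loops produced.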
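Note on the kernel change: the essence is a shape-keyed program cache.
device_program_map_ maps the vector of all input shapes to a built device
program, BuildDeviceProgram() returns early on a cache hit, and
InputShapeChanged() reports whether the current input dimensions miss the
cache. A condensed sketch of the pattern with hypothetical CompiledProgram and
ShapeCachedEngine names (the HiAI build step is elided):

#include <cstdint>
#include <map>
#include <memory>
#include <vector>

// One cache entry per distinct set of input dimensions; a hypothetical
// stand-in for device_program_t above.
using ShapeKey = std::vector<std::vector<int64_t>>;

struct CompiledProgram {
  ShapeKey shapes;
};

class ShapeCachedEngine {
 public:
  // Return the cached program for this shape signature, building and
  // inserting a new one on a miss (cf. BuildDeviceProgram above).
  std::shared_ptr<CompiledProgram> GetOrBuild(const ShapeKey& key) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;  // cache hit: skip the build
    auto program = std::make_shared<CompiledProgram>();
    program->shapes = key;  // the real compile step is elided
    cache_[key] = program;
    return program;
  }

  // Mirrors InputShapeChanged(): true iff no program exists for this key.
  bool InputShapeChanged(const ShapeKey& key) const {
    return cache_.count(key) == 0;
  }

 private:
  std::map<ShapeKey, std::shared_ptr<CompiledProgram>> cache_;
};

int main() {
  ShapeCachedEngine engine;
  ShapeKey key = {{1, 3, 224, 224}};
  bool miss_before = engine.InputShapeChanged(key);  // true: cache is empty
  engine.GetOrBuild(key);                            // build once and cache
  bool miss_after = engine.InputShapeChanged(key);   // false: cache hit
  return (miss_before && !miss_after) ? 0 : 1;
}

std::map can key on the shape signature directly because std::vector provides
lexicographic operator<; that property is what lets device_program_map_ work
without a custom comparator.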