Commit aa05c93e authored by zhupengyang and committed by GitHub

[NPU] save subgraph model cache (#3589)

Parent 950b7382
......@@ -36,6 +36,11 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
}
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
std::unique_ptr<lite_api::Tensor> LightPredictorImpl::GetInput(int i) {
......
......@@ -118,18 +118,27 @@ class LITE_API ConfigBase {
std::string model_dir_;
int threads_{1};
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
// set Model_dir
void set_model_dir(const std::string& x) { model_dir_ = x; }
const std::string& model_dir() const { return model_dir_; }
// set Power_mode
void set_power_mode(PowerMode mode);
PowerMode power_mode() const { return mode_; }
// set Thread
void set_threads(int threads);
int threads() const { return threads_; }
// set Power_mode
void set_power_mode(PowerMode mode);
PowerMode power_mode() const { return mode_; }
// set subgraph_model_dir
void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
};
/// CxxConfig is the config for the Full feature predictor.
......
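The new setter/getter live on ConfigBase, so both CxxConfig and MobileConfig inherit them. A minimal usage sketch follows (the include path, model file, and cache directory are placeholders, not part of this commit): point the config at a writable directory so the NPU subgraph kernel can reuse the generated .om model across runs.

// Usage sketch; paths and the header include are illustrative.
#include "lite/api/paddle_api.h"

void BuildPredictorWithNpuCache() {
  paddle::lite_api::MobileConfig config;
  config.set_model_from_file("/data/local/tmp/model.nb");  // placeholder model
  config.set_subgraph_model_cache_dir("/data/local/tmp");  // placeholder cache dir
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  // fill inputs and call predictor->Run() as usual
}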
......@@ -14,15 +14,50 @@
#include "lite/backends/npu/device.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace npu {
bool WriteToOMFile(const domi::ModelBufferData& om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "wb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
uint32_t write_size =
(uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp);
CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !";
fclose(fp);
return true;
}
bool ReadFromOMFile(domi::ModelBufferData* om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "rb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
fseek(fp, 0, SEEK_END);
uint32_t model_length = (uint32_t)ftell(fp);
fseek(fp, 0, SEEK_SET);
om_model_buff->data = malloc(model_length);
om_model_buff->length = model_length;
uint32_t read_size =
(uint32_t)fread(om_model_buff->data, 1, model_length, fp);
CHECK_EQ(read_size, model_length) << "read om file failed !";
fclose(fp);
return true;
}
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_full_dir = "" // NOLINT
) {
VLOG(3) << "[NPU] Build model";
// Build the HiAI IR graph to the HiAI om model
......@@ -32,14 +67,24 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
om_model.SetGraph(ir_graph);
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_model_buf;
if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return nullptr;
}
if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) {
VLOG(3) << "Will read om model from " << model_cache_full_dir;
ReadFromOMFile(&om_model_buf, model_cache_full_dir);
} else {
if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return nullptr;
}
if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
if (!model_cache_full_dir.empty()) {
VLOG(3) << "Will write om model to " << model_cache_full_dir;
WriteToOMFile(om_model_buf, model_cache_full_dir);
}
}
// Create a HiAI model manager client to load the HiAI om model
......
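WriteToOMFile and ReadFromOMFile treat the cached model as a raw byte dump of the built om buffer. Below is a stand-alone round-trip sketch of the same fwrite/fread pattern; ModelBuf stands in for domi::ModelBufferData and the /tmp path is illustrative, so it compiles without the HiAI headers.

// Round-trip sketch of the cache file I/O pattern (stand-in types and path).
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct ModelBuf {      // stand-in for domi::ModelBufferData
  void* data{nullptr};
  uint32_t length{0};
};

int main() {
  const char* path = "/tmp/cache_test.om";  // illustrative path
  ModelBuf out;
  out.length = 4;
  out.data = std::malloc(out.length);
  std::memcpy(out.data, "\x01\x02\x03\x04", out.length);

  // Write the buffer to disk, as WriteToOMFile does with the built om model.
  FILE* fp = std::fopen(path, "wb");
  if (!fp) return 1;
  std::fwrite(out.data, 1, out.length, fp);
  std::fclose(fp);

  // Read it back, as ReadFromOMFile does on a cache hit: size the buffer from
  // the file length, then read the whole file.
  ModelBuf in;
  fp = std::fopen(path, "rb");
  if (!fp) return 1;
  std::fseek(fp, 0, SEEK_END);
  in.length = static_cast<uint32_t>(std::ftell(fp));
  std::fseek(fp, 0, SEEK_SET);
  in.data = std::malloc(in.length);
  std::fread(in.data, 1, in.length, fp);
  std::fclose(fp);

  bool ok = in.length == out.length &&
            std::memcmp(in.data, out.data, in.length) == 0;
  std::free(out.data);
  std::free(in.data);
  return ok ? 0 : 1;
}

Note that the cache file carries no version or checksum header: if a file with the expected name exists, it is loaded as-is, so clearing the cache directory after updating the model or toolkit appears to be left to the caller.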
......@@ -41,10 +41,11 @@ class Device {
// Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference.
std::shared_ptr<hiai::AiModelMngerClient> Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_name // NOLINT
); // NOLINT
private:
int freq_level_{3};
......
......@@ -17,6 +17,10 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#endif
#ifdef LITE_WITH_XPU
std::string Context<TargetType::kXPU>::_multi_encoder_precision; // NOLINT
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
......
......@@ -85,6 +85,16 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir() {
return subgraph_model_cache_dir_;
}
private:
static std::string subgraph_model_cache_dir_;
};
#endif
......
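Because subgraph_model_cache_dir_ is a static member, a single value is shared by every NPUContext in the process; the definition added to context.cc above provides its one out-of-class storage location. A minimal sketch of the same pattern, with hypothetical class and field names:

// Static-member pattern sketch (hypothetical names): declared in the class,
// defined exactly once in a .cc file, set by the config layer and read later
// by the NPU subgraph kernel.
#include <string>

class CacheDirHolder {
 public:
  static void Set(const std::string& dir) { dir_ = dir; }
  static const std::string& Get() { return dir_; }

 private:
  static std::string dir_;  // declaration only; storage lives below
};

// In exactly one translation unit (cf. the definition added to context.cc):
std::string CacheDirHolder::dir_;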
......@@ -132,6 +132,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
mobile_config.set_model_from_file(optimized_model_dir + ".nb");
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
// mobile_config.set_subgraph_model_cache_dir("/data/local/tmp");
predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensors(predictor, input_tensor_shape, input_tensor_type, 1);
// Run optimized model
......
......@@ -33,13 +33,15 @@ class Engine {
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
lite::Scope *scope)
lite::Scope *scope,
std::string model_cache_dir = "")
: ctx_(ctx),
block_idx_(block_idx),
block_desc_(block_desc),
input_names_(input_names),
output_names_(output_names),
scope_(scope) {}
scope_(scope),
model_cache_dir_(model_cache_dir) {}
virtual ~Engine() = default;
virtual int Build();
......@@ -73,6 +75,7 @@ class Engine {
std::vector<Tensor *> origin_itensors_;
std::vector<Tensor *> origin_otensors_;
std::vector<Instruction> origin_program_;
std::string model_cache_dir_{""};
};
} // namespace subgraph
......
......@@ -15,6 +15,7 @@
#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <utility>
#include "hiai_ir_build.h" // NOLINT
#include "lite/backends/npu/device.h"
......@@ -22,12 +23,41 @@
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#include "lite/kernels/npu/bridges/utility.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
std::string SubgraphEngine::GenerateModelCacheName() const {
auto inames = device_inames_;
auto onames = device_onames_;
std::sort(inames.begin(), inames.end());
std::sort(onames.begin(), onames.end());
std::string model_cache_name = "";
for (auto iname : inames) {
auto itensor = scope_->FindTensor(iname);
std::replace(iname.begin(), iname.end(), '/', '_');
model_cache_name += "_" + iname;
for (auto i : itensor->dims().Vectorize()) {
model_cache_name += "_" + std::to_string(i);
}
}
for (auto oname : onames) {
auto otensor = scope_->FindTensor(oname);
std::replace(oname.begin(), oname.end(), '/', '_');
model_cache_name += "_" + oname;
for (auto i : otensor->dims().Vectorize()) {
model_cache_name += "_" + std::to_string(i);
}
}
model_cache_name += "_.om";
return model_cache_name;
}
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all of ops and their input vars and weights and added into the NPU
......@@ -88,8 +118,11 @@ int SubgraphEngine::BuildDeviceProgram() {
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
std::string model_cache_full_dir =
model_cache_dir_.empty() ? "" : model_cache_dir_ + "/" +
GenerateModelCacheName();
auto device_client = lite::npu::Device::Global().Build(
model_name_, device_inodes, device_onodes);
model_name_, device_inodes, device_onodes, model_cache_full_dir);
if (device_client == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED;
......@@ -280,7 +313,8 @@ void SubgraphCompute::PrepareForRun() {
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
param.scope,
NPUContext::SubgraphModelCacheDir()));
CHECK(engine_);
engine_->Build();
}
......
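GenerateModelCacheName above derives the cache file name from the sorted input/output tensor names (with '/' replaced by '_') and their current dims, so each distinct I/O-shape combination maps to its own .om file. A worked illustration with hypothetical tensors, an input "image" of shape 1x3x224x224 and an output "fc/out" of shape 1x1000:

// Worked illustration (tensor names and shapes are hypothetical): reproduces
// the naming scheme of GenerateModelCacheName. std::map keeps the names
// sorted, mirroring the std::sort calls above, and '/' is replaced by '_' so
// the result is a valid file name.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<int64_t>> inputs{{"image", {1, 3, 224, 224}}};
  std::map<std::string, std::vector<int64_t>> outputs{{"fc/out", {1, 1000}}};
  std::string name;
  auto append = [&name](const std::map<std::string, std::vector<int64_t>>& m) {
    for (const auto& kv : m) {
      std::string n = kv.first;
      std::replace(n.begin(), n.end(), '/', '_');
      name += "_" + n;
      for (auto d : kv.second) name += "_" + std::to_string(d);
    }
  };
  append(inputs);
  append(outputs);
  name += "_.om";
  std::cout << name << std::endl;  // _image_1_3_224_224_fc_out_1_1000_.om
  return 0;
}

Since the name encodes only the interface (I/O names and shapes), a subgraph whose internals change while its interface stays the same would still pick up an existing cache file; again, invalidating stale caches is left to the user.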
......@@ -35,9 +35,15 @@ class SubgraphEngine : public subgraph::Engine {
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
Scope *scope,
std::string model_cache_dir = "")
: subgraph::Engine(ctx,
block_idx,
block_desc,
input_names,
output_names,
scope,
model_cache_dir) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
......@@ -58,6 +64,8 @@ class SubgraphEngine : public subgraph::Engine {
void InitDeviceTensor() override;
bool InputShapeChanged() override;
std::string GenerateModelCacheName() const;
std::string model_name_{"model.om"};
std::vector<std::vector<int64_t>> inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
......