Unverified commit e6b3d883, authored by Qi Li, committed by GitHub

[NPU] apply npu cache offline model to other devices, test=develop (#3925)

* [NPU] apply npu cache offline model to other devices, test=develop

* [NPU] address review comments, test=develop
Parent: b85bc6e5
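Every backend touched in the diff below follows the same pattern: BuildDeviceProgram() and LaunchDeviceProgram() switch from int status codes to bool, the origin program is built lazily inside BuildDeviceProgram() when origin_program_ is empty, and the explicit engine_->Build() call is dropped from PrepareForRun() in favour of a single engine_->Run(). A minimal, self-contained sketch of the run loop this is presumably moved into the shared subgraph::Engine base class (MockEngine and its members are hypothetical, not the Paddle-Lite API):

#include <cstdint>
#include <iostream>
#include <vector>

class MockEngine {
 public:
  bool Run(const std::vector<int64_t>& input_shape) {
    // Build lazily on the first run, and rebuild when the input shape changes.
    if (first_run_ || input_shape != last_shape_) {
      device_program_ready_ = BuildDeviceProgram();
      last_shape_ = input_shape;
      first_run_ = false;
    }
    // Fall back to the origin (CPU) program if the device program is unusable.
    return device_program_ready_ ? LaunchDeviceProgram() : LaunchOriginProgram();
  }

 private:
  bool BuildDeviceProgram() { return true; }  // stand-in for a backend build
  bool LaunchDeviceProgram() { std::cout << "run on device\n"; return true; }
  bool LaunchOriginProgram() { std::cout << "run origin program\n"; return true; }

  bool first_run_{true};
  bool device_program_ready_{false};
  std::vector<int64_t> last_shape_;
};

int main() {
  MockEngine engine;
  engine.Run({1, 3, 224, 224});  // first call: builds, then runs on the device
  engine.Run({1, 3, 224, 224});  // same shape: reuses the built device program
  return 0;
}

The rebuild-on-shape-change and fall-back-to-origin behaviour mirrors the Launch() logic removed from the MLU header further down; the exact base-class implementation is not part of this diff.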
@@ -28,7 +28,7 @@ namespace lite {
 namespace kernels {
 namespace apu {

-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::BuildDeviceProgram() {
   unsigned int version;
   Neuron_getVersion(&version);
   VLOG(3) << "Neuron Adapter version: " << version;
@@ -38,7 +38,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   int neuron_errCode = NeuronModel_create(&model_);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to create model";
-    return subgraph::FAILED;
+    return false;
   }
   graph.set_model(model_);
   graph.set_input_names(input_names_);
@@ -46,6 +46,9 @@ int SubgraphEngine::BuildDeviceProgram() {
   // Convert all of ops and their input vars and weights and added into the APU
   // NIR graph
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   const auto& bridges = subgraph::Registry::Instance();
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
@@ -54,7 +57,7 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kAPU))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
@@ -63,7 +66,7 @@ int SubgraphEngine::BuildDeviceProgram() {
                                          const_cast<OpLite*>(op),
                                          const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
@@ -84,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
     } else {
       LOG(WARNING) << "Fail to find input: " << input_names_[i];
-      return subgraph::FAILED;
+      return false;
     }
   }
@@ -105,7 +108,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
     } else {
       LOG(WARNING) << "Fail to find output: " << output_names_[i];
-      return subgraph::FAILED;
+      return false;
     }
   }
@@ -116,7 +119,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   neuron_errCode = NeuronModel_finish(model_);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
-    return subgraph::FAILED;
+    return false;
   }
   VLOG(3) << "[APU] APU NIR model created!";
@@ -129,15 +132,14 @@ int SubgraphEngine::BuildDeviceProgram() {
   compilation_ = lite::apu::Device::Global().Build(model_);
   if (compilation_ == nullptr) {
     LOG(WARNING) << "[APU] Build APU DLA model failed!";
-    return subgraph::FAILED;
+    return false;
   }
   VLOG(3) << "[APU] APU DLA model created, Build cost "
           << GetCurrentUS() - start_time << " us";
-  return status;
+  return true;
 }

-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   auto GetCurrentUS = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -149,7 +151,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   int neuron_errCode = NeuronExecution_create(compilation_, &run);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "[APU] Build APU runtime failed!";
-    return subgraph::FAILED;
+    return false;
   }

   // Set input buffer
@@ -177,7 +179,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   neuron_errCode = NeuronExecution_compute(run);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to run execution!" << neuron_errCode;
-    return subgraph::FAILED;
+    return false;
   }

   for (size_t i = 0; i < origin_otensors_.size(); i++) {
@@ -190,7 +192,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   }
   NeuronExecution_free(run);
   VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
-  return 0;
+  return true;
 }

 SubgraphEngine::~SubgraphEngine() {
@@ -211,12 +213,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }

 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }

 }  // namespace apu
......
@@ -41,8 +41,8 @@ class SubgraphEngine : public subgraph::Engine {
   ~SubgraphEngine();

  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;

   NeuronModel *model_;
   NeuronCompilation *compilation_;
......
@@ -28,12 +28,35 @@ namespace lite {
 namespace kernels {
 namespace bm {

-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_inputs_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_inputs_[i].reset(new hiai::AiTensor);
+    CHECK(device_inputs_[i]);
+  }
+  device_outputs_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_outputs_[i].reset(new hiai::AiTensor);
+    CHECK(device_outputs_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   int status = 0;
   subgraph::bm::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
   graph.CreateCompilerHandle();
   auto& ctx = this->ctx_->template As<BMContext>();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -42,7 +65,7 @@ int SubgraphEngine::BuildDeviceProgram() {
     std::string op_type = op->op_info()->Type();
     LOG(INFO) << op_type;
     if (!bridges.Exists(op_type, TARGET(kBM))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
     status |=
@@ -50,7 +73,7 @@ int SubgraphEngine::BuildDeviceProgram() {
                                          const_cast<OpLite*>(op),
                                          const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   std::string net_name = "bmnetc_f32umodel";
@@ -63,7 +86,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   graph.UnlockCompilerMutex();
   bmrt_hd_ = bmrt_create(bm_hd_);
   if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
-    return subgraph::FAILED;
+    return false;
   }
   bmrt_get_network_names(bmrt_hd_, &net_names_);
   net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
@@ -116,10 +139,10 @@ int SubgraphEngine::BuildDeviceProgram() {
                        net_info_->output_dtypes[i],
                        stage.output_shapes[i]);
   }
-  return status;
+  return true;
 }

-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   for (size_t i = 0; i < device_inputs_.size(); i++) {
     bm_memcpy_s2d(bm_hd_,
                   device_inputs_[i].device_mem,
@@ -143,7 +166,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
       out_index++;
     }
   }
-  return 0;
+  return true;
 }

 void SubgraphCompute::PrepareForRun() {
@@ -155,12 +178,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }

 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }

 }  // namespace bm
......
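The PrepareWorkspaceForDeviceProgram() hook added above splits device-tensor setup into two phases: placeholder tensor objects are allocated once when the kernel is prepared, and their dimensions are bound later when the device program is (re)built. A short sketch of that split, using a hypothetical DeviceTensor type instead of the backend-specific tensor classes in the diff:

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-in for a backend device tensor (hiai::AiTensor, DLTensor, ...).
struct DeviceTensor {
  std::vector<int64_t> dims;  // left empty until the program is built
};

class Workspace {
 public:
  // Phase 1, at PrepareForRun time: allocate one placeholder per input name,
  // but leave the shapes untouched -- they may still change before the first run.
  bool Prepare(const std::vector<std::string>& input_names) {
    device_inputs_.resize(input_names.size());
    for (auto& t : device_inputs_) t.reset(new DeviceTensor);
    return true;
  }

  // Phase 2, at BuildDeviceProgram time: bind the now-known shapes to the
  // placeholders before compiling the device program.
  bool Build(const std::vector<std::vector<int64_t>>& input_shapes) {
    if (input_shapes.size() != device_inputs_.size()) return false;
    for (size_t i = 0; i < input_shapes.size(); ++i) {
      device_inputs_[i]->dims = input_shapes[i];
    }
    return true;
  }

 private:
  std::vector<std::unique_ptr<DeviceTensor>> device_inputs_;
};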
@@ -44,8 +44,9 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}

  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;

  private:
   void *bmrt_hd_;
......
@@ -62,32 +62,6 @@ class SubgraphEngine : public subgraph::Engine {
     }
   }

-  int Build() {
-    // In order to attach all of the ops of the block desc, we need to build
-    // the original program firstly.
-    BuildOriginProgram();
-    // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
-    build_device_program_status_ = BuildDeviceProgram();
-    return build_device_program_status_;
-  }
-
-  int Launch() {
-    // Rebuild device program when the shapes of input tensors have been
-    // changed.
-    if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
-        subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
-            build_device_program_status_) &&
-        InputShapeChanged()) {
-      Build();
-    }
-    if (subgraph::CHECK_FAILED(build_device_program_status_)) {
-      LaunchOriginProgram();
-    } else {
-      LaunchDeviceProgram();
-    }
-    return 0;
-  }
-
   bool InputShapeChanged() {
     std::vector<std::vector<int64_t>> new_shape;
     // used in batch changable situation
@@ -127,7 +101,10 @@ class SubgraphEngine : public subgraph::Engine {
   }

  protected:
-  int BuildDeviceProgram() override {
+  bool BuildDeviceProgram() override {
+    if (origin_program_.empty()) {
+      BuildOriginProgram();
+    }
     if (!error_compile_batch_size_changeable_ &&
         !disable_batch_size_changeable_) {
       int status = BuildDeviceProgramImpl();
@@ -142,7 +119,7 @@ class SubgraphEngine : public subgraph::Engine {
     return BuildDeviceProgramImpl();
   }

-  int BuildDeviceProgramImpl() {
+  bool BuildDeviceProgramImpl() {
     int status = 0;
     auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
     graph->SetFPType(fp_type_);
@@ -197,13 +174,16 @@ class SubgraphEngine : public subgraph::Engine {
         status |= subgraph::FAILED;
         VLOG(4) << "[MLU] found unsupported batch_size changeable op type: "
                 << op_type;
-        return status;
+        if (subgraph::CHECK_FAILED(status)) {
+          return false;
+        }
+        return true;
       }
       op->CheckShape();
       const_cast<OpLite*>(op)->InferShape();
       if (!bridges.Exists(op_type, TARGET(kMLU))) {
         LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
-        return subgraph::FAILED;
+        return false;
       }
       auto kernel = inst.kernel();
       status |= bridges.Select(op_type, TARGET(kMLU))(
@@ -211,7 +191,7 @@ class SubgraphEngine : public subgraph::Engine {
           const_cast<OpLite*>(op),
           const_cast<KernelBase*>(kernel));
       if (subgraph::CHECK_FAILED(status)) {
-        return subgraph::FAILED;
+        return false;
       }
     }
     // Obtain the output nodes of the MLU IR graph and build the graph to MLU
@@ -242,7 +222,7 @@ class SubgraphEngine : public subgraph::Engine {
     if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) {
       graph->GenOfflineModel(GetOfflineModName());
     }
-    return status;
+    return true;
   }

   std::string TrimStrings(const std::string& origin_str) {
@@ -329,7 +309,7 @@ class SubgraphEngine : public subgraph::Engine {
     }
   }

-  int LaunchDeviceProgram() override {
+  bool LaunchDeviceProgram() override {
     // prepare input and output memory
     auto& mlu_context = this->ctx_->template As<MLUContext>();
     auto exec_queue = mlu_context.exec_queue();
@@ -453,7 +433,7 @@ class SubgraphEngine : public subgraph::Engine {
       // =========== DUMP END ================
     }
-    return 0;
+    return true;
   }

   paddle::lite_api::PrecisionType fp_type_;
@@ -501,12 +481,11 @@ class SubgraphCompute
                                      param.scope,
                                      this->precision()));
     CHECK(engine_);
-    engine_->Build();
   }

   void Run() override {
     CHECK(engine_);
-    engine_->Launch();
+    engine_->Run();
   }

   virtual ~SubgraphCompute() = default;
......
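In the MLU hunk above, the old code returned the accumulated status bitmask directly; with a bool return type, any non-zero status (including non-fatal bits such as the rebuild-on-shape-change flag) would silently convert to true, so the new code checks subgraph::CHECK_FAILED(status) explicitly before returning. A minimal sketch of that bit-flag convention, with illustrative enum values and helper names that are assumptions rather than copies of the Paddle-Lite definitions:

#include <cstdio>

// Assumed bit-flag status convention (values are illustrative only).
enum Status : int {
  SUCCESS = 0,
  FAILED = 1,                      // fatal: fall back to the origin program
  REBUILD_WHEN_SHAPE_CHANGED = 2,  // non-fatal: rebuild on new input shapes
};

inline bool CheckFailed(int status) { return (status & FAILED) != 0; }

int main() {
  int status = SUCCESS;
  status |= REBUILD_WHEN_SHAPE_CHANGED;  // op is supported, but shape-sensitive
  // Returning `status` from a bool function would also yield true here,
  // losing the distinction; an explicit failure check keeps the meaning.
  std::printf("failed: %s\n", CheckFailed(status) ? "yes" : "no");  // prints "no"
  return 0;
}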
@@ -28,13 +28,36 @@ namespace lite {
 namespace kernels {
 namespace rknpu {

-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_itensors_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_itensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_itensors_[i]);
+  }
+  device_otensors_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_otensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_otensors_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
   int status = 0;
   // Convert all of ops and their input vars and weights and added into the NPU
   // RKNPU IR graph
   subgraph::rknpu::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -42,13 +65,13 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kRKNPU))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
     status |= bridges.Select(op_type, TARGET(kRKNPU))(
         reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   // Collect the valid input and output nodes in the RKNPU IR graph and update
@@ -91,7 +114,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
   if (device_program_ == nullptr) {
     LOG(WARNING) << "[RKNPU] Build model failed!";
-    return subgraph::FAILED;
+    return false;
   }

   // input
@@ -165,10 +188,10 @@ int SubgraphEngine::BuildDeviceProgram() {
       break;
     }
   }
-  return status;
+  return true;
 }

-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   LOG(INFO) << "[RKNPU]:LaunchDeviceProgram";
   std::vector<rk::nn::InputInfo> inputs;
   std::vector<rk::nn::OutputInfo> outputs;
@@ -195,7 +218,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   device_program_->SetInputs(inputs);
   device_program_->Run();
   device_program_->GetOutputs(outputs);
-  return 0;
+  return true;
 }

 void SubgraphCompute::PrepareForRun() {
@@ -208,13 +231,12 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }

 void SubgraphCompute::Run() {
   LOG(INFO) << "[RKNPU]:Run";
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }

 }  // namespace rknpu
......
@@ -42,14 +42,15 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}

  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;

   std::string model_name_;
   std::vector<std::string> device_inames_;
   std::vector<std::string> device_onames_;
-  std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_;
-  std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_;
+  std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_{};
+  std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_{};
   std::unique_ptr<rk::nn::Exection> device_program_{nullptr};
 };
......
@@ -27,12 +27,35 @@ namespace lite {
 namespace kernels {
 namespace xpu {

-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_itensors_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_itensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_itensors_[i]);
+  }
+  device_otensors_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_otensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_otensors_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   int status = 0;
   // Convert all of ops and their input vars and weights and added into the XPU
   // IR graph
   subgraph::xpu::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -40,13 +63,13 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kXPU))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
     status |= bridges.Select(op_type, TARGET(kXPU))(
         reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   // Obtain the output nodes of the XPU IR graph and build the graph to the XPU
@@ -86,7 +109,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       &graph.builder_, &graph.params_, &device_onodes);
   if (device_program_ == nullptr) {
     LOG(WARNING) << "[XPU] Build model failed!";
-    return subgraph::FAILED;
+    return false;
   }

   // Query and check the dimensions of input and output tensors
@@ -166,10 +189,10 @@ int SubgraphEngine::BuildDeviceProgram() {
     device_otensors_[i].strides = nullptr;
     device_otensors_[i].byte_offset = 0;
   }
-  return status;
+  return true;
 }

-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   for (size_t i = 0; i < device_itensors_.size(); i++) {
     // Update the data pointer of DLTensor to track the origin input tensors
     device_itensors_[i].data =
@@ -191,7 +214,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
         const_cast<void*>(origin_otensors_[i]->raw_data());
     device_program_->CopyOutputTo(i, &device_otensors_[i]);
   }
-  return 0;
+  return true;
 }

 void SubgraphCompute::PrepareForRun() {
@@ -203,12 +226,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }

 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }

 }  // namespace xpu
......
@@ -39,13 +39,14 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}

  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;

   std::vector<std::string> device_inames_;
   std::vector<std::string> device_onames_;
-  std::vector<DLTensor> device_itensors_;
-  std::vector<DLTensor> device_otensors_;
+  std::vector<DLTensor> device_itensors_{};
+  std::vector<DLTensor> device_otensors_{};
   std::unique_ptr<xtcl::network::xRuntimeInstance> device_program_{nullptr};
 };
......