Unverified commit b2142d4a, authored by zhupengyang and committed by GitHub

[NPU] share buffer between lite tensor and npu tensor (#3606)

Parent 620b801b
@@ -139,6 +139,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
     predictor->Run();
   }
   for (int i = 0; i < FLAGS_repeats; i++) {
+    FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i);
     auto start = GetCurrentUS();
     predictor->Run();
     LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
...
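Relocating FillInputTensors into the repeat loop appears to follow from the buffer sharing introduced below: once the origin input tensor is rebound to the HiAI buffer, data written before the first run cannot be assumed to persist across runs, so the test refills the inputs on every iteration, before the timer starts. A minimal self-contained sketch of that benchmark pattern, where every name (FakePredictor, FillInput, NowUS) is hypothetical rather than Paddle-Lite API:

#include <chrono>
#include <cstdio>
#include <vector>

static double NowUS() {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::steady_clock;
  return static_cast<double>(
      duration_cast<microseconds>(steady_clock::now().time_since_epoch())
          .count());
}

struct FakePredictor {
  std::vector<float> shared_input;  // aliases the device buffer in the real code
  void Run() {}  // the device program may consume or rebind shared_input
};

static void FillInput(FakePredictor* p, int iter) {
  p->shared_input.assign(16, static_cast<float>(iter));
}

int main() {
  FakePredictor predictor;
  const int repeats = 10;
  for (int i = 0; i < repeats; ++i) {
    FillInput(&predictor, i);  // refill before each run, outside the timer
    const double start = NowUS();
    predictor.Run();
    std::printf("%d, %.0f us\n", i, NowUS() - start);
  }
  return 0;
}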
@@ -212,12 +212,6 @@ int SubgraphEngine::LaunchDeviceProgram() {
            hiai::AI_SUCCESS);
   VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
-  // Copy the data of output HiAI tensor to the buffer of origin output tensors
-  for (size_t i = 0; i < device_otensors_.size(); i++) {
-    std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
-                device_otensors_[i]->GetBuffer(),
-                device_otensors_[i]->GetSize());
-  }
   return 0;
 }
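The memcpy removed above becomes redundant once InitDeviceTensor (next hunk) makes each origin output tensor alias the corresponding HiAI tensor's buffer: the device program then writes results directly into the memory the host tensor reads. A sketch of the before/after using stand-in types (DeviceTensor and HostTensor are hypothetical, not the lite or HiAI classes):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

struct DeviceTensor {
  std::vector<char> storage;
  void* GetBuffer() { return storage.data(); }
  size_t GetSize() const { return storage.size(); }
};

struct HostTensor {
  const void* data = nullptr;  // borrowed from the device when sharing is on
  const void* raw_data() const { return data; }
};

int main() {
  DeviceTensor device{std::vector<char>(64)};
  HostTensor host;
  host.data = device.GetBuffer();  // what ResetBuffer establishes in the diff

  // Before this commit the copy below always ran; with sharing in place the
  // pointers already match, so the branch (and the memcpy) disappears.
  if (host.raw_data() != device.GetBuffer()) {
    std::memcpy(const_cast<void*>(host.raw_data()),
                device.GetBuffer(),
                device.GetSize());
  }
  assert(host.raw_data() == device.GetBuffer());
  return 0;
}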
@@ -236,16 +230,34 @@ int SubgraphEngine::Build() {
 void SubgraphEngine::InitDeviceTensor() {
   auto device_program = device_program_map_[inputs_shape_];
   for (size_t i = 0; i < device_itensors_.size(); i++) {
-    device_itensors_[i]->Init(&(device_program->device_idims[i]));
-    std::memcpy(device_itensors_[i]->GetBuffer(),
-                origin_itensors_[i]->raw_data(),
-                origin_itensors_[i]->memory_size());
+    if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) {
+      VLOG(3) << "init device_itensors and share input tensor buf between "
+                 "device and host";
+      device_itensors_[i]->Init(&(device_program->device_idims[i]));
+      std::memcpy(device_itensors_[i]->GetBuffer(),
+                  origin_itensors_[i]->raw_data(),
+                  origin_itensors_[i]->memory_size());
+      // share data buf between device_itensor and origin_itensor
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_itensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_itensors_[i]->GetSize());
+      origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize());
+    }
   }
   for (size_t i = 0; i < device_otensors_.size(); i++) {
-    device_otensors_[i]->Init(&(device_program->device_odims[i]));
-  }
-  for (size_t i = 0; i < origin_otensors_.size(); i++) {
-    origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+    if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) {
+      VLOG(3) << "init device_otensors and share output tensor buf between "
+                 "device and host";
+      device_otensors_[i]->Init(&(device_program->device_odims[i]));
+      // share data buf between device_otensor and origin_otensor
+      origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_otensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_otensors_[i]->GetSize());
+      origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize());
+    }
   }
 }
...
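Taken together, the pattern is: wrap the device-owned buffer in a shared Buffer handle, rebind the host tensor to it, and guard with a pointer-equality check so repeated runs skip reinitialization. The sketch below models that flow with simplified stand-ins; the real lite::Buffer constructor also takes a lite_api::TargetType, and the real ResetBuffer takes an explicit size, as the diff shows:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

class Buffer {  // borrows memory it does not own (the HiAI tensor owns it)
 public:
  Buffer(void* data, size_t size) : data_(data), size_(size) {}
  void* data() const { return data_; }
  size_t size() const { return size_; }

 private:
  void* data_;
  size_t size_;
};

class Tensor {
 public:
  const void* raw_data() const { return buffer_ ? buffer_->data() : nullptr; }
  // Rebind to an externally owned buffer; the shared_ptr keeps the wrapper
  // alive as long as any tensor still references it.
  void ResetBuffer(std::shared_ptr<Buffer> buffer) {
    buffer_ = std::move(buffer);
  }

 private:
  std::shared_ptr<Buffer> buffer_;
};

int main() {
  std::vector<char> device_storage(128);  // stands in for the HiAI buffer
  Tensor origin;

  for (int run = 0; run < 3; ++run) {
    // Same guard as InitDeviceTensor: only (re)bind when the pointers differ,
    // so repeated runs with an unchanged device program skip this work.
    if (origin.raw_data() != device_storage.data()) {
      std::puts("binding host tensor to device buffer");
      origin.ResetBuffer(std::make_shared<Buffer>(device_storage.data(),
                                                  device_storage.size()));
    }
  }
  return 0;  // the message prints once; later runs reuse the shared buffer
}

Using shared_ptr for the wrapper is what makes the aliasing safe on the host side: the origin tensor can be read (or re-shared) after InitDeviceTensor returns without the Buffer handle going away underneath it, while ownership of the underlying memory stays with the device tensor.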