From a66b29d714d46bf6471bfa21e37a6c8d02909dbb Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Tue, 12 May 2020 19:35:50 +0800 Subject: [PATCH] [NPU] share buffer between lite tensor and npu tensor (#3606) --- lite/core/mir/subgraph/subgraph_pass_test.cc | 1 + lite/kernels/npu/subgraph_compute.cc | 40 +++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index c638793c08..ee2d67e918 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -139,6 +139,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( predictor->Run(); } for (int i = 0; i < FLAGS_repeats; i++) { + FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i); auto start = GetCurrentUS(); predictor->Run(); LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 1a991bfc74..da2fd3ead2 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -212,12 +212,6 @@ int SubgraphEngine::LaunchDeviceProgram() { hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - // Copy the data of output HiAI tensor to the buffer of origin output tensors - for (size_t i = 0; i < device_otensors_.size(); i++) { - std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()), - device_otensors_[i]->GetBuffer(), - device_otensors_[i]->GetSize()); - } return 0; } @@ -236,16 +230,34 @@ int SubgraphEngine::Build() { void SubgraphEngine::InitDeviceTensor() { auto device_program = device_program_map_[inputs_shape_]; for (size_t i = 0; i < device_itensors_.size(); i++) { - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + if (device_itensors_[i]->GetBuffer() != 
origin_itensors_[i]->raw_data()) { + VLOG(3) << "init device_itensors and share input tensor buf between " + "device and host"; + device_itensors_[i]->Init(&(device_program->device_idims[i])); + std::memcpy(device_itensors_[i]->GetBuffer(), + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); + // share data buf between device_itensor and origin_itensor + std::shared_ptr<lite::Buffer> buffer = + std::make_shared<lite::Buffer>(device_itensors_[i]->GetBuffer(), + lite_api::TargetType::kHost, + device_itensors_[i]->GetSize()); + origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); + } } for (size_t i = 0; i < device_otensors_.size(); i++) { - device_otensors_[i]->Init(&(device_program->device_odims[i])); - } - for (size_t i = 0; i < origin_otensors_.size(); i++) { - origin_otensors_[i]->Resize(device_program->origin_odims[i]); + if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { + VLOG(3) << "init device_otensors and share output tensor buf between " + "device and host"; + device_otensors_[i]->Init(&(device_program->device_odims[i])); + // share data buf between device_otensor and origin_otensor + origin_otensors_[i]->Resize(device_program->origin_odims[i]); + std::shared_ptr<lite::Buffer> buffer = + std::make_shared<lite::Buffer>(device_otensors_[i]->GetBuffer(), + lite_api::TargetType::kHost, + device_otensors_[i]->GetSize()); + origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + } } } -- GitLab