From 0d28ee29066ba46b48198d62605f6b61fcf92719 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Thu, 21 Apr 2022 11:17:08 +0800
Subject: [PATCH] infer add io stream. (#42031)

* infer add io stream.

* add macro
---
 cmake/external/lite.cmake                      |   2 +-
 .../fluid/inference/api/analysis_predictor.cc  |  18 +++
 .../fluid/inference/api/analysis_predictor.h   |   4 +
 .../inference/api/details/zero_copy_tensor.cc  | 133 ++++++++++++++++++
 paddle/fluid/inference/api/paddle_api.h        |  12 ++
 paddle/fluid/inference/api/paddle_tensor.h     |   5 +
 6 files changed, 173 insertions(+), 1 deletion(-)

diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index f1d206dd5e..0031757467 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -50,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d)
+    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7ec3271c66..015f4471a0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1931,11 +1931,29 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
 #endif
   return false;
 }
+
 void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
                                             bool with_interleaved) {
 #ifdef PADDLE_WITH_CUDA
   c->trt_with_interleaved_ = with_interleaved;
 #endif
 }
+
+void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
+#ifdef PADDLE_WITH_CUDA
+  auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
+  paddle::platform::DeviceContextPool &pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
+      pool.Get(pred->place_));
+  cudaStreamSynchronize(dev_ctx->stream());
+#endif
+}
+void InternalUtils::SyncStream(cudaStream_t stream) {
+#ifdef PADDLE_WITH_CUDA
+  cudaStreamSynchronize(stream);
+#endif
+}
+
 }  // namespace experimental
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index d9992f3fbe..e96526730f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -38,6 +38,9 @@
 
 namespace paddle_infer {
 using float16 = paddle::platform::float16;
+namespace experimental {
+class InternalUtils;
+};
 }
 ///
 /// \file analysis_predictor.h
@@ -492,6 +495,7 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<distributed::FleetExecutor> fleet_exe_;
   std::shared_ptr<distributed::TaskNode> task_node_;
 #endif
+  friend class paddle_infer::experimental::InternalUtils;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 0f26a1076a..7461724afb 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -714,4 +714,137 @@ template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
 #endif
 
+namespace experimental {
+template <typename T>
+void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
+                                            const T *data,
+                                            cudaStream_t stream) {
+  if (t->tensor_ == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        t->name_.empty(), false,
+        paddle::platform::errors::PreconditionNotMet(
+            "Need to SetName first, so that the corresponding tensor can "
+            "be retrieved."));
+    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
+    auto *var = scope->FindVar(t->name_);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, paddle::platform::errors::PreconditionNotMet(
+                 "No tensor called [%s] in the runtime scope", t->name_));
+    auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    t->tensor_ = tensor;
+  }
+
+  auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "You should call Tensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
+  size_t ele_size = tensor->numel() * sizeof(T);
+  if (t->place_ == PlaceType::kCPU) {
+    auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
+    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  } else if (t->place_ == PlaceType::kGPU) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::platform::CUDAPlace gpu_place(t->device_);
+    auto *t_data = tensor->mutable_data<T>(gpu_place);
+    paddle::memory::Copy(gpu_place, static_cast<void *>(t_data),
+                         paddle::platform::CPUPlace(), data, ele_size, stream);
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "CopyFromCpuWithIoStream only supports CPU and GPU now."));
+  }
+}
+
+template <typename T>
+void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
+                                          cudaStream_t stream) {
+  if (t->tensor_ == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        t->name_.empty(), false,
+        paddle::platform::errors::PreconditionNotMet(
+            "Need to SetName first, so that the corresponding tensor can "
+            "be retrieved."));
+    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
+    auto *var = scope->FindVar(t->name_);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, paddle::platform::errors::PreconditionNotMet(
+                 "No tensor called [%s] in the runtime scope", t->name_));
+    auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    t->tensor_ = tensor;
+  }
+
+  auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
+  auto ele_num = tensor->numel();
+  auto *t_data = tensor->data<T>();
+  auto t_place = tensor->place();
+
+  paddle::framework::Tensor out;
+  auto mem_allocation =
+      std::make_shared<paddle::memory::allocation::Allocation>(
+          static_cast<void *>(data), ele_num * sizeof(T),
+          paddle::platform::CPUPlace());
+  out.ResetHolder(mem_allocation);
+
+  if (paddle::platform::is_cpu_place(t_place)) {
+#ifdef PADDLE_WITH_MKLDNN
+    if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN)
+      paddle::framework::innerTransDataLayoutFromMKLDNN(
+          tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls()
+                                .get_cur_paddle_data_layout(),
+          *tensor, &out, paddle::platform::CPUPlace(), true);
+    else
+      std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#else
+    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#endif
+  } else if (t->place_ == PlaceType::kGPU) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::memory::Copy(paddle::platform::CPUPlace(),
+                         static_cast<void *>(data), t_place, t_data,
+                         ele_num * sizeof(T), stream);
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "CopyToCpuWithIoStream only supports CPU and GPU now."));
+  }
+}
+
+template void InternalUtils::CopyFromCpuWithIoStream<float>(
+    paddle_infer::Tensor *t, const float *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int64_t>(
+    paddle_infer::Tensor *t, const int64_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int32_t>(
+    paddle_infer::Tensor *t, const int32_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<uint8_t>(
+    paddle_infer::Tensor *t, const uint8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
+    paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<float16>(
+    paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
+
+template void InternalUtils::CopyToCpuWithIoStream<float>(
+    paddle_infer::Tensor *t, float *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int64_t>(
+    paddle_infer::Tensor *t, int64_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int32_t>(
+    paddle_infer::Tensor *t, int32_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<uint8_t>(
+    paddle_infer::Tensor *t, uint8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
+    paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<float16>(
+    paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
+
+}  // namespace experimental
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 0f8f9e0a97..dc9f7debe5 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -420,8 +420,10 @@ using hipStream_t = struct ihipStream_t*;
 
 namespace paddle_infer {
 class Predictor;
+class Tensor;
 using Config = paddle::AnalysisConfig;
 namespace experimental {
+// Unstable interface, may be modified or deleted in the future.
 class PD_INFER_DECL InternalUtils {
  public:
   // Note: Can only be used under thread_local semantics.
@@ -429,8 +431,18 @@
                                     cudaStream_t stream);
   static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                     hipStream_t stream);
+
   static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                       bool with_interleaved);
+
+  static void SyncStream(paddle_infer::Predictor* pred);
+  static void SyncStream(cudaStream_t stream);
+  template <typename T>
+  static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t, const T* data,
+                                      cudaStream_t stream);
+  template <typename T>
+  static void CopyToCpuWithIoStream(paddle_infer::Tensor* t, T* data,
+                                    cudaStream_t stream);
 };
 }  // namespace experimental
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 2afe2d32e2..6f99ed6e25 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -39,6 +39,10 @@ namespace contrib {
 class TensorUtils;
 }
 
+namespace experimental {
+class InternalUtils;
+};
+
 /// \brief Paddle data type.
 enum DataType {
   FLOAT32,
@@ -198,6 +202,7 @@ class PD_INFER_DECL Tensor {
 #endif
 
   friend class paddle_infer::contrib::TensorUtils;
+  friend class paddle_infer::experimental::InternalUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
 #endif
-- 
GitLab
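The patch above only adds the library plumbing, so here is a minimal usage sketch (not part of the diff) showing how the new experimental entry points fit together: stage the input on a user-owned CUDA IO stream with CopyFromCpuWithIoStream, run the predictor, then fetch the output with CopyToCpuWithIoStream and SyncStream. The model paths, GPU memory pool size, input shape, and the decision to synchronize the IO stream before and after Run() are assumptions made for illustration; the public header name paddle_inference_api.h is the usual standalone-install header that pulls in paddle_api.h. The patch itself marks InternalUtils as an unstable interface, so treat this as a sketch rather than a supported recipe.

// Hypothetical example exercising the APIs added by this PR.
// Model files, shapes, and stream handling are illustrative assumptions.
#include <cuda_runtime.h>

#include <functional>
#include <numeric>
#include <vector>

#include "paddle_inference_api.h"

int main() {
  // Configure a GPU predictor; the model paths are placeholders.
  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(256 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
  auto predictor = paddle_infer::CreatePredictor(config);

  // A user-owned CUDA stream dedicated to host<->device IO copies.
  cudaStream_t io_stream;
  cudaStreamCreate(&io_stream);

  // Dummy input; the 1x3x224x224 float shape is an assumption.
  std::vector<int> shape{1, 3, 224, 224};
  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);

  auto input_name = predictor->GetInputNames()[0];
  auto input_t = predictor->GetInputHandle(input_name);
  input_t->Reshape(shape);
  // Asynchronous H2D copy on the user-supplied IO stream (added by this PR).
  paddle_infer::experimental::InternalUtils::CopyFromCpuWithIoStream<float>(
      input_t.get(), input.data(), io_stream);
  // Conservatively make sure the input is on device before running.
  paddle_infer::experimental::InternalUtils::SyncStream(io_stream);

  predictor->Run();

  auto output_name = predictor->GetOutputNames()[0];
  auto output_t = predictor->GetOutputHandle(output_name);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> output(out_num);
  // Asynchronous D2H copy on the same IO stream, then synchronize it
  // before touching the host buffer.
  paddle_infer::experimental::InternalUtils::CopyToCpuWithIoStream<float>(
      output_t.get(), output.data(), io_stream);
  paddle_infer::experimental::InternalUtils::SyncStream(io_stream);

  cudaStreamDestroy(io_stream);
  return 0;
}

Keeping the IO copies on a separate stream lets host-device transfers overlap with unrelated work; the explicit SyncStream calls are the conservative ordering choice here, since the predictor's compute stream is not the IO stream.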