From 0d28ee29066ba46b48198d62605f6b61fcf92719 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Thu, 21 Apr 2022 11:17:08 +0800
Subject: [PATCH] infer add io stream. (#42031)

* infer add io stream.

* add macro
---
 cmake/external/lite.cmake                      |   2 +-
 .../fluid/inference/api/analysis_predictor.cc  |  18 +++
 .../fluid/inference/api/analysis_predictor.h   |   4 +
 .../inference/api/details/zero_copy_tensor.cc  | 133 ++++++++++++++++++
 paddle/fluid/inference/api/paddle_api.h        |  12 ++
 paddle/fluid/inference/api/paddle_tensor.h     |   5 +
 6 files changed, 173 insertions(+), 1 deletion(-)

diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index f1d206dd5e..0031757467 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -50,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d)
+    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7ec3271c66..015f4471a0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1931,11 +1931,29 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
 #endif
   return false;
 }
+
 void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
                                             bool with_interleaved) {
 #ifdef PADDLE_WITH_CUDA
   c->trt_with_interleaved_ = with_interleaved;
 #endif
 }
+
+void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
+#ifdef PADDLE_WITH_CUDA
+  auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
+  paddle::platform::DeviceContextPool &pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
+      pool.Get(pred->place_));
+  cudaStreamSynchronize(dev_ctx->stream());
+#endif
+}
+void InternalUtils::SyncStream(cudaStream_t stream) {
+#ifdef PADDLE_WITH_CUDA
+  cudaStreamSynchronize(stream);
+#endif
+}
+
 }  // namespace experimental
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index d9992f3fbe..e96526730f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -38,6 +38,9 @@
 
 namespace paddle_infer {
 using float16 = paddle::platform::float16;
+namespace experimental {
+class InternalUtils;
+};
 }
 ///
 /// \file analysis_predictor.h
@@ -492,6 +495,7 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<distributed::FleetExecutor> fleet_exe_;
   std::shared_ptr<distributed::TaskNode> task_node_;
 #endif
+  friend class paddle_infer::experimental::InternalUtils;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 0f26a1076a..7461724afb 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -714,4 +714,137 @@ template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
 #endif
 
+namespace experimental {
+template <typename T>
+void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
+                                            const T *data,
+                                            cudaStream_t stream) {
+  if (t->tensor_ == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        t->name_.empty(), false,
+        paddle::platform::errors::PreconditionNotMet(
+            "Need to SetName first, so that the corresponding tensor can "
+            "be retrieved."));
+    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
+    auto *var = scope->FindVar(t->name_);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, paddle::platform::errors::PreconditionNotMet(
+                 "No tensor called [%s] in the runtime scope", t->name_));
+    auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    t->tensor_ = tensor;
+  }
+
+  auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "You should call Tensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
+  size_t ele_size = tensor->numel() * sizeof(T);
+  if (t->place_ == PlaceType::kCPU) {
+    auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
+    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  } else if (t->place_ == PlaceType::kGPU) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::platform::CUDAPlace gpu_place(t->device_);
+    auto *t_data = tensor->mutable_data<T>(gpu_place);
+    paddle::memory::Copy(gpu_place, static_cast<void *>(t_data),
+                         paddle::platform::CPUPlace(), data, ele_size, stream);
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "CopyFromCpuWithIoStream only supports CPU and GPU now."));
+  }
+}
+
+template <typename T>
+void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
+                                          cudaStream_t stream) {
+  if (t->tensor_ == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        t->name_.empty(), false,
+        paddle::platform::errors::PreconditionNotMet(
+            "Need to SetName first, so that the corresponding tensor can "
+            "be retrieved."));
+    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
+    auto *var = scope->FindVar(t->name_);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, paddle::platform::errors::PreconditionNotMet(
+                 "No tensor called [%s] in the runtime scope", t->name_));
+    auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    t->tensor_ = tensor;
+  }
+
+  auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
+  auto ele_num = tensor->numel();
+  auto *t_data = tensor->data<T>();
+  auto t_place = tensor->place();
+
+  paddle::framework::Tensor out;
+  auto mem_allocation =
+      std::make_shared<paddle::memory::allocation::Allocation>(
+          static_cast<void *>(data), ele_num * sizeof(T),
+          paddle::platform::CPUPlace());
+  out.ResetHolder(mem_allocation);
+
+  if (paddle::platform::is_cpu_place(t_place)) {
+#ifdef PADDLE_WITH_MKLDNN
+    if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN)
+      paddle::framework::innerTransDataLayoutFromMKLDNN(
+          tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls()
+                                .get_cur_paddle_data_layout(),
+          *tensor, &out, paddle::platform::CPUPlace(), true);
+    else
+      std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#else
+    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#endif
+  } else if (t->place_ == PlaceType::kGPU) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::memory::Copy(paddle::platform::CPUPlace(),
+                         static_cast<void *>(data), t_place, t_data,
+                         ele_num * sizeof(T), stream);
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "CopyToCpuWithIoStream only supports CPU and GPU now."));
+  }
+}
+
+template void InternalUtils::CopyFromCpuWithIoStream<float>(
+    paddle_infer::Tensor *t, const float *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int64_t>(
+    paddle_infer::Tensor *t, const int64_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int32_t>(
+    paddle_infer::Tensor *t, const int32_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<uint8_t>(
+    paddle_infer::Tensor *t, const uint8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
+    paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyFromCpuWithIoStream<float16>(
+    paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
+
+template void InternalUtils::CopyToCpuWithIoStream<float>(
+    paddle_infer::Tensor *t, float *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int64_t>(
+    paddle_infer::Tensor *t, int64_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int32_t>(
+    paddle_infer::Tensor *t, int32_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<uint8_t>(
+    paddle_infer::Tensor *t, uint8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
+    paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
+template void InternalUtils::CopyToCpuWithIoStream<float16>(
+    paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
+
+}  // namespace experimental
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 0f8f9e0a97..dc9f7debe5 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -420,8 +420,10 @@ using hipStream_t = struct ihipStream_t*;
 
 namespace paddle_infer {
 class Predictor;
+class Tensor;
 using Config = paddle::AnalysisConfig;
 namespace experimental {
+// Unstable interface, may be modified or deleted in the future.
 class PD_INFER_DECL InternalUtils {
  public:
   // Note: Can only be used under thread_local semantics.
@@ -429,8 +431,18 @@
                                     cudaStream_t stream);
   static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                     hipStream_t stream);
+
   static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                       bool with_interleaved);
+
+  static void SyncStream(paddle_infer::Predictor* pred);
+  static void SyncStream(cudaStream_t stream);
+  template <typename T>
+  static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t, const T* data,
+                                      cudaStream_t stream);
+  template <typename T>
+  static void CopyToCpuWithIoStream(paddle_infer::Tensor* t, T* data,
+                                    cudaStream_t stream);
 };
 }  // namespace experimental
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 2afe2d32e2..6f99ed6e25 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -39,6 +39,10 @@ namespace contrib {
 class TensorUtils;
 }
 
+namespace experimental {
+class InternalUtils;
+};
+
 /// \brief Paddle data type.
 enum DataType {
   FLOAT32,
@@ -198,6 +202,7 @@ class PD_INFER_DECL Tensor {
 #endif
 
   friend class paddle_infer::contrib::TensorUtils;
+  friend class paddle_infer::experimental::InternalUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
 #endif
-- 
GitLab
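The patch above only adds the library plumbing, so here is a minimal usage sketch (not part of the diff) showing how the new experimental entry points fit together: stage the input on a user-owned CUDA IO stream with CopyFromCpuWithIoStream, run the predictor, then fetch the output with CopyToCpuWithIoStream and SyncStream. The model paths, GPU memory pool size, input shape, and the decision to synchronize the IO stream before and after Run() are assumptions made for illustration; the public header name paddle_inference_api.h is the usual standalone-install header that pulls in paddle_api.h. The patch itself marks InternalUtils as an unstable interface, so treat this as a sketch rather than a supported recipe.

// Hypothetical example exercising the APIs added by this PR.
// Model files, shapes, and stream handling are illustrative assumptions.
#include <cuda_runtime.h>

#include <functional>
#include <numeric>
#include <vector>

#include "paddle_inference_api.h"

int main() {
  // Configure a GPU predictor; the model paths are placeholders.
  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(256 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
  auto predictor = paddle_infer::CreatePredictor(config);

  // A user-owned CUDA stream dedicated to host<->device IO copies.
  cudaStream_t io_stream;
  cudaStreamCreate(&io_stream);

  // Dummy input; the 1x3x224x224 float shape is an assumption.
  std::vector<int> shape{1, 3, 224, 224};
  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);

  auto input_name = predictor->GetInputNames()[0];
  auto input_t = predictor->GetInputHandle(input_name);
  input_t->Reshape(shape);
  // Asynchronous H2D copy on the user-supplied IO stream (added by this PR).
  paddle_infer::experimental::InternalUtils::CopyFromCpuWithIoStream<float>(
      input_t.get(), input.data(), io_stream);
  // Conservatively make sure the input is on device before running.
  paddle_infer::experimental::InternalUtils::SyncStream(io_stream);

  predictor->Run();

  auto output_name = predictor->GetOutputNames()[0];
  auto output_t = predictor->GetOutputHandle(output_name);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> output(out_num);
  // Asynchronous D2H copy on the same IO stream, then synchronize it
  // before touching the host buffer.
  paddle_infer::experimental::InternalUtils::CopyToCpuWithIoStream<float>(
      output_t.get(), output.data(), io_stream);
  paddle_infer::experimental::InternalUtils::SyncStream(io_stream);

  cudaStreamDestroy(io_stream);
  return 0;
}

Keeping the IO copies on a separate stream lets host-device transfers overlap with unrelated work; the explicit SyncStream calls are the conservative ordering choice here, since the predictor's compute stream is not the IO stream.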