From 35297bd8fa772af8356ff494cdcfdd4d1a4b7901 Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Tue, 2 Aug 2022 01:48:40 -0500 Subject: [PATCH] [cherry-pick]Ort backend optimizer(#44136 #44703 #44724) (#44766) * [Inference]ort backend optimizer (#44136) * add ort clone interface * paddle2onnx update to 1.0.0rc * ort input_tensor use mutable data of scope * clone ort_predictor reuse session (#44703) * ort backend support output mutable data (#44724) * 2.3 interface is different from the Develop interface * 2.3 interface is different from the Develop interface * 2.3 interface is different from the Develop interface --- cmake/external/paddle2onnx.cmake | 2 +- .../inference/api/details/zero_copy_tensor.cc | 250 ++++++++---------- .../inference/api/onnxruntime_predictor.cc | 187 ++++++++----- .../inference/api/onnxruntime_predictor.h | 53 +++- paddle/fluid/inference/api/paddle_tensor.h | 4 +- 5 files changed, 268 insertions(+), 228 deletions(-) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index cbb622f5cb9..b8a1b4548b8 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -24,7 +24,7 @@ endif() include(ExternalProject) set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") -set(PADDLE2ONNX_VERSION "0.9.9") +set(PADDLE2ONNX_VERSION "1.0.0rc") set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) set(PADDLE2ONNX_SOURCE_DIR ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT}) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 910256834b1..d68c49fe6b3 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -40,36 +40,42 @@ void Tensor::Reshape(const std::vector &shape) { #endif PADDLE_ENFORCE_EQ( - name_.empty(), false, + name_.empty(), + false, paddle::platform::errors::PreconditionNotMet( "Need to SetName first, so that the corresponding tensor can " "be retrieved.")); - PADDLE_ENFORCE_EQ(input_or_output_, true, + PADDLE_ENFORCE_EQ(input_or_output_, + true, paddle::platform::errors::PermissionDenied( "Can't reshape the output tensor, it is readonly")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); PADDLE_ENFORCE_NOT_NULL( - var, paddle::platform::errors::PreconditionNotMet( - "No tensor called [%s] in the runtime scope", name_)); + var, + paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(phi::make_ddim(shape)); } void Tensor::ReshapeStrings(const size_t &shape) { PADDLE_ENFORCE_EQ( - name_.empty(), false, + name_.empty(), + false, paddle::platform::errors::PreconditionNotMet( "Need to SetName first, so that the corresponding tensor can " "be retrieved.")); - PADDLE_ENFORCE_EQ(input_or_output_, true, + PADDLE_ENFORCE_EQ(input_or_output_, + true, paddle::platform::errors::PermissionDenied( "Can't reshape the output tensor, it is readonly")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); PADDLE_ENFORCE_NOT_NULL( - var, paddle::platform::errors::PreconditionNotMet( - "No tensor called [%s] in the runtime scope", name_)); + var, + paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); paddle_infer::Strings *tensor = var->GetMutable(); tensor->resize(shape); } @@ -82,9 +88,15 @@ void Tensor::ReshapeStrings(const size_t &shape) { template T 
*Tensor::mutable_data(PlaceType place) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return ORTGetMutableData(); + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GT( - tensor->numel(), 0, + tensor->numel(), + 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const std::vector " "&shape)" @@ -161,15 +173,9 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { -#ifdef PADDLE_WITH_ONNXRUNTIME - if (is_ort_tensor_) { - ORTCopyFromCpu(data); - return; - } -#endif - EAGER_GET_TENSOR(paddle::framework::LoDTensor); - PADDLE_ENFORCE_GE(tensor->numel(), 0, + PADDLE_ENFORCE_GE(tensor->numel(), + 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " "std::vector &shape)" @@ -188,8 +194,11 @@ void Tensor::CopyFromCpu(const T *data) { auto *dev_ctx = static_cast( pool.Get(gpu_place)); - paddle::memory::Copy(gpu_place, static_cast(t_data), - paddle::platform::CPUPlace(), data, ele_size, + paddle::memory::Copy(gpu_place, + static_cast(t_data), + paddle::platform::CPUPlace(), + data, + ele_size, dev_ctx->stream()); #else PADDLE_THROW(paddle::platform::errors::Unavailable( @@ -200,8 +209,11 @@ void Tensor::CopyFromCpu(const T *data) { #ifdef PADDLE_WITH_XPU paddle::platform::XPUPlace xpu_place(device_); auto *t_data = tensor->mutable_data(xpu_place); - paddle::memory::Copy(xpu_place, static_cast(t_data), - paddle::platform::CPUPlace(), data, ele_size); + paddle::memory::Copy(xpu_place, + static_cast(t_data), + paddle::platform::CPUPlace(), + data, + ele_size); #else PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with XPU place because paddle is not compiled " @@ -215,8 +227,11 @@ void Tensor::CopyFromCpu(const T *data) { auto *t_data = tensor->mutable_data(npu_place); auto *dev_ctx = static_cast( pool.Get(npu_place)); - paddle::memory::Copy(npu_place, static_cast(t_data), - paddle::platform::CPUPlace(), data, ele_size, + paddle::memory::Copy(npu_place, + static_cast(t_data), + paddle::platform::CPUPlace(), + data, + ele_size, dev_ctx->stream()); #else PADDLE_THROW(paddle::platform::errors::Unavailable( @@ -264,30 +279,33 @@ struct DataTypeInfo { paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { PADDLE_ENFORCE_EQ( - layout, DataLayout::kNCHW, + layout, + DataLayout::kNCHW, paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); return paddle::experimental::DataLayout::NCHW; } template -void Tensor::ShareExternalData(const T *data, const std::vector &shape, - PlaceType place, DataLayout layout) { +void Tensor::ShareExternalData(const T *data, + const std::vector &shape, + PlaceType place, + DataLayout layout) { EAGER_GET_TENSOR(paddle::framework::LoDTensor) size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * sizeof(T); - phi::DenseTensorMeta meta(DataTypeInfo().TYPE, phi::make_ddim(shape), - LayoutConvert(layout)); + phi::DenseTensorMeta meta( + DataTypeInfo().TYPE, phi::make_ddim(shape), LayoutConvert(layout)); if (place == PlaceType::kCPU) { phi::DenseTensor dtensor( - std::make_shared(const_cast(data), size, - paddle::platform::CPUPlace()), + std::make_shared( + const_cast(data), size, paddle::platform::CPUPlace()), meta); *tensor = std::move(dtensor); } else if (place == PlaceType::kGPU) { phi::DenseTensor dtensor( - std::make_shared(const_cast(data), size, - paddle::platform::CUDAPlace(device_)), + std::make_shared( + const_cast(data), size, 
paddle::platform::CUDAPlace(device_)), meta); *tensor = std::move(dtensor); } else { @@ -298,7 +316,8 @@ void Tensor::ShareExternalData(const T *data, const std::vector &shape, void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { EAGER_GET_TENSOR(paddle_infer::Strings); - PADDLE_ENFORCE_GE(tensor->size(), 0, + PADDLE_ENFORCE_GE(tensor->size(), + 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " "std::size_t &shape)function before copying" @@ -307,7 +326,9 @@ void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { } template -void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, +void Tensor::CopyToCpuImpl(T *data, + void *exec_stream, + CallbackFunc cb, void *cb_params) const { EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto ele_num = tensor->numel(); @@ -317,7 +338,8 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, paddle::framework::Tensor out; auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), + static_cast(data), + ele_num * sizeof(T), paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); @@ -325,9 +347,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_MKLDNN if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( - tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), - *tensor, &out, paddle::platform::CPUPlace(), true); + tensor->layout(), + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), + *tensor, + &out, + paddle::platform::CPUPlace(), + true); else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #else @@ -349,8 +375,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto *dev_ctx = static_cast( pool.Get(gpu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), - static_cast(data), gpu_place, t_data, - ele_num * sizeof(T), dev_ctx->stream()); + static_cast(data), + gpu_place, + t_data, + ele_num * sizeof(T), + dev_ctx->stream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); #else @@ -374,7 +403,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_XPU auto xpu_place = t_place; paddle::memory::Copy(paddle::platform::CPUPlace(), - static_cast(data), xpu_place, t_data, + static_cast(data), + xpu_place, + t_data, ele_num * sizeof(T)); #else PADDLE_THROW(paddle::platform::errors::Unavailable( @@ -389,8 +420,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto *dev_ctx = static_cast( pool.Get(npu_place)); paddle::memory::Copy(paddle::platform::CPUPlace(), - static_cast(data), npu_place, t_data, - ele_num * sizeof(T), dev_ctx->stream()); + static_cast(data), + npu_place, + t_data, + ele_num * sizeof(T), + dev_ctx->stream()); paddle::platform::NPUStreamSync(dev_ctx->stream()); #else PADDLE_THROW(paddle::platform::errors::Unavailable( @@ -433,22 +467,34 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); template PD_INFER_DECL void Tensor::ShareExternalData( - const float *data, const std::vector &shape, PlaceType place, + const float *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::ShareExternalData( - const int64_t *data, const std::vector &shape, PlaceType place, + const 
int64_t *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::ShareExternalData( - const int32_t *data, const std::vector &shape, PlaceType place, + const int32_t *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::ShareExternalData( - const uint8_t *data, const std::vector &shape, PlaceType place, + const uint8_t *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::ShareExternalData( - const int8_t *data, const std::vector &shape, PlaceType place, + const int8_t *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::ShareExternalData( - const float16 *data, const std::vector &shape, PlaceType place, + const float16 *data, + const std::vector &shape, + PlaceType place, DataLayout layout); template PD_INFER_DECL void Tensor::CopyToCpu(float *data) const; @@ -524,15 +570,17 @@ Tensor::Tensor(void *scope) : scope_{scope} {} template void *Tensor::FindTensor() const { PADDLE_ENFORCE_EQ( - name_.empty(), false, + name_.empty(), + false, paddle::platform::errors::PreconditionNotMet( "Need to SetName first, so that the corresponding tensor can " "be retrieved.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); PADDLE_ENFORCE_NOT_NULL( - var, paddle::platform::errors::PreconditionNotMet( - "No tensor called [%s] in the runtime scope", name_)); + var, + paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); return tensor; } @@ -560,8 +608,9 @@ std::vector Tensor::shape() const { #endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( - tensor_, paddle::platform::errors::PreconditionNotMet( - "Not found tensor called %s in the scope", name_)); + tensor_, + paddle::platform::errors::PreconditionNotMet( + "Not found tensor called %s in the scope", name_)); // mkldnn may does layout transform internally, so need to reorder before // return #ifdef PADDLE_WITH_MKLDNN @@ -626,91 +675,15 @@ void Tensor::SetOrtBinding(const std::shared_ptr binding) { binding_ = binding; } -void Tensor::SetOrtBuffer(const std::shared_ptr> buffer) { - buffer_ = buffer; -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, - size_t size, const int64_t *shape, size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, data, size, shape, - shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, - size_t size, const int64_t *shape, size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, data, size, shape, - shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, - size_t size, const int64_t *shape, size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, data, size, shape, - shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, - size_t size, const int64_t *shape, size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, data, size, shape, - shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, - size_t size, const int64_t *shape, size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, data, size, shape, - shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, - size_t size, const int64_t *shape, size_t shape_len) { - return 
Ort::Value::CreateTensor(memory_info, static_cast(data), - size * sizeof(float16), shape, shape_len, - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); -} - template -void Tensor::ORTCopyFromCpu(const T *data) { +T *Tensor::ORTGetMutableData() { auto binding = binding_.lock(); PADDLE_ENFORCE_NOT_NULL(binding, paddle::platform::errors::PreconditionNotMet( - "input tensor [%s] no binding ptr", name_)); - const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, - OrtMemTypeDefault); - size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, - std::multiplies()); - auto buffer = buffer_.lock(); - size_t buffer_size = size * sizeof(T); - if (buffer_size > buffer->size()) { - buffer->resize(buffer_size); - } - std::memcpy(static_cast(buffer->data()), data, buffer_size); - - auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; - if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Found undefined data type for onnxruntime, only supports " - "float16/float32/float64/int8/uint8/int32/int64.")); - } - - auto ort_value = - Ort::Value::CreateTensor(memory_info, buffer->data(), buffer_size, - shape_.data(), shape_.size(), onnx_dtype); - - binding->BindInput(name_.c_str(), ort_value); + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + return value.GetTensorMutableData(); } template @@ -733,13 +706,6 @@ void Tensor::ORTCopyToCpu(T *data) const { } } -template void Tensor::ORTCopyFromCpu(const float *data); -template void Tensor::ORTCopyFromCpu(const int64_t *data); -template void Tensor::ORTCopyFromCpu(const int32_t *data); -template void Tensor::ORTCopyFromCpu(const uint8_t *data); -template void Tensor::ORTCopyFromCpu(const int8_t *data); -template void Tensor::ORTCopyFromCpu(const float16 *data); - template void Tensor::ORTCopyToCpu(float *data) const; template void Tensor::ORTCopyToCpu(int32_t *data) const; template void Tensor::ORTCopyToCpu(uint8_t *data) const; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 98fd3267ace..87ef91a5326 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -24,11 +24,10 @@ #include #include -#include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" @@ 
-71,22 +70,23 @@ bool CheckConvertToONNX(const AnalysisConfig &config) { } else if (config.prog_file().empty() || config.params_file().empty()) { LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s' or params path '%s'.", - config.model_dir(), config.prog_file(), config.params_file()); + config.model_dir(), + config.prog_file(), + config.params_file()); return false; } if (config.model_from_memory()) { - return paddle2onnx::IsExportable( - config.prog_file().data(), config.prog_file().size(), - config.params_file().data(), config.params_file().size()); + return paddle2onnx::IsExportable(config.prog_file().data(), + config.prog_file().size(), + config.params_file().data(), + config.params_file().size()); } else { return paddle2onnx::IsExportable(config.prog_file().c_str(), config.params_file().c_str()); } } -bool ONNXRuntimePredictor::Init() { - VLOG(3) << "ONNXRuntime Predictor::init()"; - +bool ONNXRuntimePredictor::InitBinding() { // Now ONNXRuntime only support CPU const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { @@ -94,16 +94,69 @@ bool ONNXRuntimePredictor::Init() { } else { place_ = paddle::platform::CPUPlace(); } + scope_.reset(new paddle::framework::Scope()); + + binding_ = std::make_shared(*session_); + Ort::MemoryInfo memory_info( + device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); + Ort::Allocator allocator(*session_, memory_info); + + size_t n_inputs = session_->GetInputCount(); + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_->GetInputName(i, allocator); + auto type_info = session_->GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + + allocator.Free(input_name); + } + + size_t n_outputs = session_->GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_->GetOutputName(i, allocator); + auto type_info = session_->GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + + Ort::MemoryInfo out_memory_info(device_name, + OrtDeviceAllocator, + place_.GetDeviceId(), + OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + + allocator.Free(output_name); + } + return true; +} + +bool ONNXRuntimePredictor::Init() { + VLOG(3) << "ONNXRuntime Predictor::init()"; char *onnx_proto = nullptr; int out_size; if (config_.model_from_memory()) { - paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), + paddle2onnx::Export(config_.prog_file().data(), + config_.prog_file().size(), config_.params_file().data(), - config_.params_file().size(), &onnx_proto, &out_size); + config_.params_file().size(), + &onnx_proto, + &out_size); } else { paddle2onnx::Export(config_.prog_file().c_str(), - config_.params_file().c_str(), &onnx_proto, &out_size); + config_.params_file().c_str(), + &onnx_proto, + &out_size); } Ort::SessionOptions session_options; @@ -131,42 +184,11 @@ bool ONNXRuntimePredictor::Init() { "will be " "generated."; } - 
session_ = {env_, onnx_proto, static_cast(out_size), session_options}; - binding_ = std::make_shared(session_); + session_ = std::make_shared( + *env_, onnx_proto, static_cast(out_size), session_options); + InitBinding(); - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - Ort::Allocator allocator(session_, memory_info); - - size_t n_inputs = session_.GetInputCount(); - for (size_t i = 0; i < n_inputs; ++i) { - auto input_name = session_.GetInputName(i, allocator); - auto type_info = session_.GetInputTypeInfo(i); - std::vector shape = - type_info.GetTensorTypeAndShapeInfo().GetShape(); - ONNXTensorElementDataType data_type = - type_info.GetTensorTypeAndShapeInfo().GetElementType(); - input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); - allocator.Free(input_name); - } - - size_t n_outputs = session_.GetOutputCount(); - for (size_t i = 0; i < n_outputs; ++i) { - auto output_name = session_.GetOutputName(i, allocator); - auto type_info = session_.GetOutputTypeInfo(i); - std::vector shape = - type_info.GetTensorTypeAndShapeInfo().GetShape(); - ONNXTensorElementDataType data_type = - type_info.GetTensorTypeAndShapeInfo().GetElementType(); - output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); - - Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - binding_->BindOutput(output_name, out_memory_info); - - allocator.Free(output_name); - } - delete onnx_proto; + delete[] onnx_proto; onnx_proto = nullptr; return true; } @@ -181,7 +203,8 @@ CreatePaddlePredictor( } PADDLE_ENFORCE_EQ( - config.is_valid(), true, + config.is_valid(), + true, platform::errors::InvalidArgument( "Note: Each config can only be used for one predictor.")); @@ -238,12 +261,13 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "ONNXPredictor.", - name)); - std::unique_ptr res(new ZeroCopyTensor(nullptr)); + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -252,22 +276,13 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } - res->SetOrtMark(true); - res->SetOrtBinding(binding_); - auto iter = input_buffers_.find(name); - if (iter == input_buffers_.end()) { - std::vector i_vector; - input_buffers_[name] = std::make_shared>(i_vector); - res->SetOrtBuffer(input_buffers_[name]); - } else { - res->SetOrtBuffer(iter->second); - } return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const std::string &name) { - PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, + PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), + true, platform::errors::PreconditionNotMet( "The out variable named %s is not found in the " "ONNXPredictor.", @@ -293,6 +308,24 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( return res; } +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info( + device_name, OrtDeviceAllocator, 
place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), + size, + shape.data(), + shape.size(), + desc.dtype); +} + bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -302,13 +335,21 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool ONNXRuntimePredictor::ZeroCopyRun() { try { - const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; + const char *device_name = platform::is_cpu_place(place_) ? "Cpu" : "Cuda"; + std::vector inputs; + inputs.reserve(input_desc_.size()); + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding_->BindInput(desc.name.c_str(), inputs.back()); + } for (auto output : output_desc_) { - Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); + Ort::MemoryInfo out_memory_info(device_name, + OrtDeviceAllocator, + place_.GetDeviceId(), + OrtMemTypeDefault); binding_->BindOutput(output.name.c_str(), out_memory_info); } - session_.Run({}, *(binding_.get())); + session_->Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; @@ -318,8 +359,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { } std::unique_ptr ONNXRuntimePredictor::Clone() { - LOG(ERROR) << "Not support Clone(), Please create new Predictor"; - return nullptr; + std::lock_guard lk(clone_mutex_); + auto *x = new ONNXRuntimePredictor(config_, env_, session_); + x->InitBinding(); + return std::unique_ptr(x); } uint64_t ONNXRuntimePredictor::TryShrinkMemory() { diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index e7f7732d974..2a8a0eb3ba8 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -18,8 +18,9 @@ #include #include #include -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/framework/op_compatible_info.h" + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" @@ -27,9 +28,6 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/string/printf.h" - -#include "onnxruntime_c_api.h" // NOLINT -#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle2onnx/converter.h" #ifdef PADDLE_WITH_TESTING @@ -94,7 +92,22 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { + : env_(std::make_shared(ORT_LOGGING_LEVEL_WARNING, + "paddle-ort")), + session_(nullptr), + binding_(nullptr), + config_(config) { + predictor_id_ = inference::GetUniqueId(); + } + /// + /// \brief Clone a ONNXRuntime Predictor object + /// + /// \param[in] AnalysisConfig config + /// + explicit ONNXRuntimePredictor(const AnalysisConfig &config, + std::shared_ptr env, + std::shared_ptr session) + : env_(env), 
session_(session), binding_(nullptr), config_(config) {
     predictor_id_ = inference::GetUniqueId();
   }
   ///
@@ -102,6 +115,13 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   ~ONNXRuntimePredictor();
 
+  ///
+  /// \brief Initialize ORT Binding
+  ///
+  /// \return Whether the init function executed successfully
+  ///
+  bool InitBinding();
+
   ///
   /// \brief Initialize predictor
   ///
@@ -176,6 +196,8 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   std::unique_ptr<PaddlePredictor> Clone() override;
 
+  std::shared_ptr<framework::Scope> scope_;
+
  private:
   ///
   /// \brief Whether to find in/out by name.
@@ -188,18 +210,27 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   bool FindONNXDesc(const std::string &name, bool is_input);
 
- private:
-  AnalysisConfig config_;
+  /// \brief Get the Ort::Value for an input tensor.
+  ///
+  /// \param[in] desc ONNXDesc (name, shape, dtype)
+  ///
+  /// \param[in] device_name device name, "Cpu" or "Cuda"
+  ///
+  /// \return the created Ort::Value
+  ///
+  Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
 
+ private:
   // ONNXRuntime
-  Ort::Env env_;
-  Ort::Session session_{nullptr};
+  std::shared_ptr<Ort::Env> env_;
+  std::shared_ptr<Ort::Session> session_{nullptr};
   std::shared_ptr<Ort::IoBinding> binding_;
 
+  AnalysisConfig config_;
+  std::mutex clone_mutex_;
   platform::Place place_;
   std::vector<ONNXDesc> input_desc_;
   std::vector<ONNXDesc> output_desc_;
-  std::map<std::string, std::shared_ptr<std::vector<int8_t>>> input_buffers_;
   int predictor_id_;
 
 // Some more detailed tests, they are made the friends of the predictor, so that
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index ce634ef08ca..35ff80b9cbf 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -183,7 +183,6 @@ class PD_INFER_DECL Tensor {
 #ifdef PADDLE_WITH_ONNXRUNTIME
   bool is_ort_tensor_{false};
   std::vector<int64_t> shape_;
-  std::weak_ptr<std::vector<int8_t>> buffer_;
   std::weak_ptr<Ort::IoBinding> binding_;
   int idx_{-1};
 
@@ -191,7 +190,8 @@ class PD_INFER_DECL Tensor {
 
   void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
 
-  void SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer);
+  template <typename T>
+  T* ORTGetMutableData();
 
   template <typename T>
   void ORTCopyFromCpu(const T* data);
-- 
GitLab
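
Usage note (not part of the patch): the sketch below shows how the reworked ONNXRuntime backend is typically exercised through the public paddle_infer API after this change — inputs are copied straight into the scope-backed tensors that the predictor now binds, outputs come back through the mutable-data path, and Clone() returns a predictor that reuses the existing Ort::Session. Model paths, input shape, and the dummy input data are placeholders, and the header name follows the usual inference demo layout; adjust them for a real deployment.

#include <memory>
#include <vector>

#include "paddle_inference_api.h"  // paddle_infer public API header

int main() {
  paddle_infer::Config config;
  // Placeholder model files; any Paddle inference model exportable to ONNX works.
  config.SetModel("model/inference.pdmodel", "model/inference.pdiparams");
  config.EnableONNXRuntime();      // route execution through the ORT backend
  config.EnableORTOptimization();  // let ORT apply its graph optimizations

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed the first input; CopyFromCpu writes into the scope-held tensor
  // that the ORT backend binds directly as an Ort::Value.
  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputHandle(input_names[0]);
  std::vector<float> in_data(1 * 3 * 224 * 224, 1.0f);  // placeholder data
  input->Reshape({1, 3, 224, 224});
  input->CopyFromCpu(in_data.data());

  predictor->Run();

  // Read the first output from the bound ORT output values.
  auto output_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputHandle(output_names[0]);
  std::vector<int> out_shape = output->shape();
  int out_num = 1;
  for (int d : out_shape) out_num *= d;
  std::vector<float> out_data(out_num);
  output->CopyToCpu(out_data.data());

  // With this patch Clone() is supported for the ORT backend: the clone
  // shares the existing Ort::Session instead of rebuilding it, so creating
  // one predictor per worker thread stays cheap.
  auto predictor2 = predictor->Clone();

  return 0;
}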