// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/onnxruntime_predictor.h"

#include <glog/logging.h>

#include <algorithm>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {

paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) {
  switch (type) {
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
      return paddle_infer::DataType::FLOAT32;
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
      return paddle_infer::DataType::FLOAT16;
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
      return paddle_infer::DataType::INT8;
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
      return paddle_infer::DataType::INT32;
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
      return paddle_infer::DataType::INT64;
    case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
      return paddle_infer::DataType::UINT8;
    default:
      LOG(ERROR) << "Unsupported ONNX tensor type: " << static_cast<int>(type);
      return paddle_infer::DataType::FLOAT32;
  }
}

bool CheckConvertToONNX(const AnalysisConfig &config) {
  if (!config.model_dir().empty()) {
    LOG(ERROR) << "Paddle2ONNX does not support the model_dir config.";
    // TODO(heliqi jiangjiajun): Paddle2ONNX does not support
    // config.model_dir() + "/__model__"
    // config.model_dir() + var_name
    return false;
  } else if (config.prog_file().empty() || config.params_file().empty()) {
    LOG(ERROR) << string::Sprintf(
        "Invalid model path '%s', program path '%s' or params path '%s'.",
        config.model_dir(), config.prog_file(), config.params_file());
    return false;
  }
  if (config.model_from_memory()) {
    // When the model is held in memory, prog_file()/params_file() contain the
    // serialized program and weights rather than file paths.
    return paddle2onnx::IsExportable(
        config.prog_file().data(), config.prog_file().size(),
        config.params_file().data(), config.params_file().size());
  } else {
    return paddle2onnx::IsExportable(config.prog_file().c_str(),
                                     config.params_file().c_str());
  }
}
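// Illustrative sketch (not part of the build): CheckConvertToONNX expects a
// combined prog_file/params_file model rather than a model_dir. The paths
// below are hypothetical placeholders.
//
//   paddle::AnalysisConfig config;
//   config.SetModel("./resnet50/inference.pdmodel",     // prog_file
//                   "./resnet50/inference.pdiparams");  // params_file
//   // config.SetModel("./resnet50");  // model_dir form is rejected above
//   bool exportable = paddle::CheckConvertToONNX(config);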
"Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } scope_.reset(new paddle::framework::Scope()); binding_ = std::make_shared(*session_); Ort::MemoryInfo memory_info( device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(*session_, memory_info); size_t n_inputs = session_->GetInputCount(); framework::proto::VarType::Type proto_type = framework::proto::VarType::LOD_TENSOR; for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_->GetInputName(i, allocator); auto type_info = session_->GetInputTypeInfo(i); std::vector shape = type_info.GetTensorTypeAndShapeInfo().GetShape(); ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); auto *ptr = scope_->Var(input_name); framework::InitializeVariable(ptr, proto_type); allocator.Free(input_name); } size_t n_outputs = session_->GetOutputCount(); for (size_t i = 0; i < n_outputs; ++i) { auto output_name = session_->GetOutputName(i, allocator); auto type_info = session_->GetOutputTypeInfo(i); std::vector shape = type_info.GetTensorTypeAndShapeInfo().GetShape(); ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); binding_->BindOutput(output_name, out_memory_info); allocator.Free(output_name); } return true; } bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; char *onnx_proto = nullptr; int out_size; if (config_.model_from_memory()) { paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), config_.params_file().data(), config_.params_file().size(), &onnx_proto, &out_size); } else { paddle2onnx::Export(config_.prog_file().c_str(), config_.params_file().c_str(), &onnx_proto, &out_size); } Ort::SessionOptions session_options; if (config_.ort_optimization_enabled()) { session_options.SetGraphOptimizationLevel( GraphOptimizationLevel::ORT_ENABLE_ALL); } // Turn optimization off first, and then turn it on when it's stable // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); // session_options.EnableCpuMemArena(); // session_options.EnableMemPattern(); // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); if (config_.profile_enabled()) { LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " "performance"; #if defined(_WIN32) session_options.EnableProfiling(L"ONNX"); #else session_options.EnableProfiling("ONNX"); #endif } else { VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " "will be " "generated."; } session_ = std::make_shared( *env_, onnx_proto, static_cast(out_size), session_options); InitBinding(); delete onnx_proto; onnx_proto = nullptr; return true; } template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { if (config.glog_info_disabled()) { FLAGS_logtostderr = 1; FLAGS_minloglevel = 2; // GLOG_ERROR } PADDLE_ENFORCE_EQ( config.is_valid(), true, platform::errors::InvalidArgument( "Note: Each config can only be used for one predictor.")); VLOG(3) << "create 
ONNXRuntimePredictor"; std::unique_ptr predictor(new ONNXRuntimePredictor(config)); // Each config can only be used for one predictor. config.SetInValid(); auto predictor_p = dynamic_cast(predictor.get()); if (!predictor_p->Init()) { return nullptr; } return predictor; } std::vector ONNXRuntimePredictor::GetInputNames() { std::vector input_names; for (auto input_desc : input_desc_) { input_names.push_back(input_desc.name); } return input_names; } std::map> ONNXRuntimePredictor::GetInputTensorShape() { std::map> input_shapes; for (auto input_desc : input_desc_) { input_shapes[input_desc.name] = input_desc.shape; } return input_shapes; } std::vector ONNXRuntimePredictor::GetOutputNames() { std::vector output_names; for (auto output_desc : output_desc_) { output_names.push_back(output_desc.name); } return output_names; } bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, bool is_input) { if (is_input) { for (auto i : input_desc_) if (i.name == name) return true; } else { for (auto i : output_desc_) if (i.name == name) return true; } return false; } std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), platform::errors::PreconditionNotMet( "The in variable named %s is not found in the " "ONNXPredictor.", name)); std::unique_ptr res( new ZeroCopyTensor(static_cast(scope_.get()), this)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const std::string &name) { PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, platform::errors::PreconditionNotMet( "The out variable named %s is not found in the " "ONNXPredictor.", name)); std::unique_ptr res(new ZeroCopyTensor(nullptr, this)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } res->SetOrtMark(true); res->SetOrtBinding(binding_); int size = output_desc_.size(); for (int i = 0; i < size; ++i) if (output_desc_[i].name == name) { res->idx_ = i; res->dtype_ = ConvertONNXType(output_desc_[i].dtype); break; } return res; } Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, const char *device_name) { Ort::MemoryInfo memory_info( device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); auto *var = scope_->FindVar(desc.name); auto *tensor = var->GetMutable(); size_t size = tensor->numel() * framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); std::vector shape = phi::vectorize(tensor->dims()); return Ort::Value::CreateTensor(memory_info, static_cast(tensor->data()), size, shape.data(), shape.size(), desc.dtype); } bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { LOG(ERROR) << "Not support Run"; return false; } bool ONNXRuntimePredictor::ZeroCopyRun() { try { const char *device_name = platform::is_cpu_place(place_) ? 
"Cpu" : "Cuda"; std::vector inputs; inputs.reserve(input_desc_.size()); for (auto desc : input_desc_) { inputs.push_back(GetOrtValue(desc, device_name)); binding_->BindInput(desc.name.c_str(), inputs.back()); } for (auto output : output_desc_) { Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); binding_->BindOutput(output.name.c_str(), out_memory_info); } session_->Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; } return true; } std::unique_ptr ONNXRuntimePredictor::Clone(void *stream) { std::lock_guard lk(clone_mutex_); auto *x = new ONNXRuntimePredictor(config_, env_, session_); x->InitBinding(); return std::unique_ptr(x); } uint64_t ONNXRuntimePredictor::TryShrinkMemory() { return paddle::memory::Release(place_); } ONNXRuntimePredictor::~ONNXRuntimePredictor() { binding_->ClearBoundInputs(); binding_->ClearBoundOutputs(); memory::Release(place_); } const void *ONNXRuntimePredictor::GetDeviceContexts() const { // TODO(inference): Support private device contexts. paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); const auto &dev_ctxs = pool.device_contexts(); return &const_cast< std::map>> &>( dev_ctxs); } } // namespace paddle