// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/inference/api/onnxruntime_predictor.h" #include #include #include #include #include #include #include #include #include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) { switch (type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: return paddle_infer::DataType::FLOAT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: return paddle_infer::DataType::FLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: return paddle_infer::DataType::INT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: return paddle_infer::DataType::INT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: return paddle_infer::DataType::INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: return paddle_infer::DataType::UINT8; default: LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); return paddle_infer::DataType::FLOAT32; } } bool CheckConvertToONNX(const AnalysisConfig &config) { if (!config.model_dir().empty()) { LOG(ERROR) << "Paddle2ONNX not support model_dir config"; // TODO(heliqi jiangjiajun): Paddle2ONNX not support // config.model_dir() + "/__model__" // config.model_dir() + var_name return false; } else if (config.prog_file().empty() || config.params_file().empty()) { LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s' or params path '%s'.", config.model_dir(), config.prog_file(), config.params_file()); return false; } if (config.model_from_memory()) { return paddle2onnx::IsExportable( config.prog_file().data(), config.prog_file().size(), config.params_file().data(), config.params_file().size()); } else { return paddle2onnx::IsExportable(config.prog_file().c_str(), config.params_file().c_str()); } } bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; // Now ONNXRuntime only support CPU const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } char *onnx_proto = nullptr; int out_size; if (config_.model_from_memory()) { paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), config_.params_file().data(), config_.params_file().size(), &onnx_proto, &out_size); } else { paddle2onnx::Export(config_.prog_file().c_str(), config_.params_file().c_str(), &onnx_proto, &out_size); } Ort::SessionOptions session_options; if (config_.ort_optimization_enabled()) { session_options.SetGraphOptimizationLevel( GraphOptimizationLevel::ORT_ENABLE_ALL); } // Turn optimization off first, and then turn it on when it's stable // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); // session_options.EnableCpuMemArena(); // session_options.EnableMemPattern(); // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); if (config_.profile_enabled()) { LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " "performance"; #if defined(_WIN32) session_options.EnableProfiling(L"ONNX"); #else session_options.EnableProfiling("ONNX"); #endif } else { VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " "will be " "generated."; } session_ = {env_, onnx_proto, static_cast(out_size), session_options}; binding_ = std::make_shared(session_); Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(session_, memory_info); size_t n_inputs = session_.GetInputCount(); for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); auto type_info = session_.GetInputTypeInfo(i); std::vector shape = type_info.GetTensorTypeAndShapeInfo().GetShape(); ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); allocator.Free(input_name); } size_t n_outputs = session_.GetOutputCount(); for (size_t i = 0; i < n_outputs; ++i) { auto output_name = session_.GetOutputName(i, allocator); auto type_info = session_.GetOutputTypeInfo(i); std::vector shape = type_info.GetTensorTypeAndShapeInfo().GetShape(); ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); binding_->BindOutput(output_name, out_memory_info); allocator.Free(output_name); } delete onnx_proto; onnx_proto = nullptr; return true; } template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { if (config.glog_info_disabled()) { FLAGS_logtostderr = 1; FLAGS_minloglevel = 2; // GLOG_ERROR } PADDLE_ENFORCE_EQ( config.is_valid(), true, platform::errors::InvalidArgument( "Note: Each config can only be used for one predictor.")); VLOG(3) << "create ONNXRuntimePredictor"; std::unique_ptr predictor(new ONNXRuntimePredictor(config)); // Each config can only be used for one predictor. config.SetInValid(); auto predictor_p = dynamic_cast(predictor.get()); if (!predictor_p->Init()) { return nullptr; } return predictor; } std::vector ONNXRuntimePredictor::GetInputNames() { std::vector input_names; for (auto input_desc : input_desc_) { input_names.push_back(input_desc.name); } return input_names; } std::map> ONNXRuntimePredictor::GetInputTensorShape() { std::map> input_shapes; for (auto input_desc : input_desc_) { input_shapes[input_desc.name] = input_desc.shape; } return input_shapes; } std::vector ONNXRuntimePredictor::GetOutputNames() { std::vector output_names; for (auto output_desc : output_desc_) { output_names.push_back(output_desc.name); } return output_names; } bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, bool is_input) { if (is_input) { for (auto i : input_desc_) if (i.name == name) return true; } else { for (auto i : output_desc_) if (i.name == name) return true; } return false; } std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, platform::errors::PreconditionNotMet( "The in variable named %s is not found in the " "ONNXPredictor.", name)); std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } res->SetOrtMark(true); res->SetOrtBinding(binding_); auto iter = input_buffers_.find(name); if (iter == input_buffers_.end()) { std::vector i_vector; input_buffers_[name] = std::make_shared>(i_vector); res->SetOrtBuffer(input_buffers_[name]); } else { res->SetOrtBuffer(iter->second); } return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const std::string &name) { PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, platform::errors::PreconditionNotMet( "The out variable named %s is not found in the " "ONNXPredictor.", name)); std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } res->SetOrtMark(true); res->SetOrtBinding(binding_); int size = output_desc_.size(); for (int i = 0; i < size; ++i) if (output_desc_[i].name == name) { res->idx_ = i; res->dtype_ = ConvertONNXType(output_desc_[i].dtype); break; } return res; } bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { LOG(ERROR) << "Not support Run"; return false; } bool ONNXRuntimePredictor::ZeroCopyRun() { try { const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; for (auto output : output_desc_) { Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); binding_->BindOutput(output.name.c_str(), out_memory_info); } session_.Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; } return true; } std::unique_ptr ONNXRuntimePredictor::Clone() { LOG(ERROR) << "Not support Clone(), Please create new Predictor"; return nullptr; } uint64_t ONNXRuntimePredictor::TryShrinkMemory() { return paddle::memory::Release(place_); } ONNXRuntimePredictor::~ONNXRuntimePredictor() { binding_->ClearBoundInputs(); binding_->ClearBoundOutputs(); memory::Release(place_); } } // namespace paddle