Unverified commit 61591afe, authored by heliqi and committed by GitHub

[Inference]Fix the ort Backend multiple input bug (#43621)

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* code format

* code format
Parent commit: 75144f13
...@@ -52,8 +52,9 @@ else() ...@@ -52,8 +52,9 @@ else()
) )
endif() endif()
include_directories(${ONNXRUNTIME_INC_DIR} # For ONNXRUNTIME code to include internal headers.
)# For ONNXRUNTIME code to include internal headers. include_directories(${ONNXRUNTIME_INC_DIR})
if(WIN32) if(WIN32)
set(ONNXRUNTIME_SOURCE_LIB set(ONNXRUNTIME_SOURCE_LIB
"${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll"
......
...@@ -34,15 +34,11 @@ set(PADDLE2ONNX_INC_DIR ...@@ -34,15 +34,11 @@ set(PADDLE2ONNX_INC_DIR
set(PADDLE2ONNX_LIB_DIR set(PADDLE2ONNX_LIB_DIR
"${PADDLE2ONNX_INSTALL_DIR}/lib" "${PADDLE2ONNX_INSTALL_DIR}/lib"
CACHE PATH "onnxruntime lib directory." FORCE) CACHE PATH "onnxruntime lib directory." FORCE)
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_LIB_DIR}")
"${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
include_directories(${PADDLE2ONNX_INC_DIR} # For PADDLE2ONNX code to include internal headers.
)# For PADDLE2ONNX code to include internal headers. include_directories(${PADDLE2ONNX_INC_DIR})
if(WIN32) if(WIN32)
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll" "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll"
CACHE FILEPATH "paddle2onnx library." FORCE) CACHE FILEPATH "paddle2onnx library." FORCE)
...@@ -50,9 +46,6 @@ if(WIN32) ...@@ -50,9 +46,6 @@ if(WIN32)
"${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib" "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib"
CACHE FILEPATH "paddle2onnx compile library." FORCE) CACHE FILEPATH "paddle2onnx compile library." FORCE)
elseif(APPLE) elseif(APPLE)
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "PADDLE2ONNX library." FORCE) CACHE FILEPATH "PADDLE2ONNX library." FORCE)
...@@ -60,9 +53,6 @@ elseif(APPLE) ...@@ -60,9 +53,6 @@ elseif(APPLE)
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "paddle2onnx compile library." FORCE) CACHE FILEPATH "paddle2onnx compile library." FORCE)
else() else()
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.so"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so"
CACHE FILEPATH "PADDLE2ONNX library." FORCE) CACHE FILEPATH "PADDLE2ONNX library." FORCE)
......
...@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
/* /*
* This file contains demo of mobilenet for tensorrt. * This file contains demo of mobilenet for onnxruntime backend.
*/ */
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include <algorithm>
#include <numeric>
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "utils.h" // NOLINT #include "utils.h" // NOLINT
DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(data, "", "path of data");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
...@@ -39,8 +41,21 @@ void Main() { ...@@ -39,8 +41,21 @@ void Main() {
auto predictor = paddle_infer::CreatePredictor(config); auto predictor = paddle_infer::CreatePredictor(config);
// Inference. // Inference.
LOG(INFO) << "--- prepare input data ----";
std::vector<int> input_shape = {1, 3, 224, 224}; std::vector<int> input_shape = {1, 3, 224, 224};
std::vector<float> input_data(1 * 3 * 224 * 224, 1.0); std::vector<float> input_data;
std::string line;
std::ifstream file(FLAGS_data);
std::getline(file, line);
file.close();
std::vector<std::string> data_strs;
split(line, ' ', &data_strs);
int input_num = 0;
for (auto& d : data_strs) {
input_num += 1;
input_data.push_back(std::stof(d));
}
std::vector<float> out_data; std::vector<float> out_data;
out_data.resize(1000); out_data.resize(1000);
auto input_names = predictor->GetInputNames(); auto input_names = predictor->GetInputNames();
...@@ -53,7 +68,19 @@ void Main() { ...@@ -53,7 +68,19 @@ void Main() {
predictor->Run(); predictor->Run();
output_tensor->CopyToCpu(out_data.data()); output_tensor->CopyToCpu(out_data.data());
VLOG(3) << "output.size " << out_data.size(); std::vector<int> out_index(out_data.size());
std::iota(out_index.begin(), out_index.end(), 0);
std::sort(
out_index.begin(), out_index.end(), [&out_data](int index1, int index2) {
return out_data[index1] > out_data[index2];
});
LOG(INFO) << "output.size " << out_data.size()
<< " max_index:" << out_index[0];
CHECK_EQ(out_data.size(), 1000);
int max_index = out_index[0];
CHECK_EQ(max_index, 13);
float max_score = out_data[max_index];
CHECK_LE(fabs(max_score - 0.99981), 1e-4);
} }
} // namespace demo } // namespace demo
......
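The updated demo no longer just logs the output size: it reads the input tensor from a data file, ranks the 1000 class scores, and checks that the top-1 index and score match the expected values. Below is a minimal sketch of that ranking step; `ArgMax` is a hypothetical helper written for illustration, not part of the Paddle API.

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

// Return the index of the largest score (the top-1 class), or -1 if empty.
// Same std::iota + std::sort pattern the updated demo uses.
int ArgMax(const std::vector<float>& scores) {
  std::vector<int> index(scores.size());
  std::iota(index.begin(), index.end(), 0);  // fill with 0, 1, 2, ...
  std::sort(index.begin(), index.end(),
            [&scores](int a, int b) { return scores[a] > scores[b]; });
  return index.empty() ? -1 : index.front();
}
```

For a pure top-1 check, `std::max_element` would be enough; the full index sort mirrors the demo, which keeps the whole ranking around before comparing the leading index and score.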
...@@ -52,15 +52,17 @@ if [ $7 == ON ]; then ...@@ -52,15 +52,17 @@ if [ $7 == ON ]; then
mkdir -p MobileNetV2 mkdir -p MobileNetV2
cd MobileNetV2 cd MobileNetV2
if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
echo "MobileNetV2.inference.model.tar.gz has been downloaded." rm -rf MobileNetV2.inference.model.tar.gz
else fi
# echo "MobileNetV2.inference.model.tar.gz has been downloaded."
# else
if [ $WIN_DETECT != "" ]; then if [ $WIN_DETECT != "" ]; then
wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
else else
wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
fi fi
tar xzf *.tar.gz tar xzf *.tar.gz
fi # fi
cd .. cd ..
fi fi
...@@ -265,7 +267,8 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -265,7 +267,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc) make -j$(nproc)
./onnxruntime_mobilenet_demo \ ./onnxruntime_mobilenet_demo \
--modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 \
--data=$DATA_DIR/MobileNetV2/MobileNetV2/data.txt
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "onnxruntime_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt echo "onnxruntime_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt
EXIT_CODE=1 EXIT_CODE=1
......
...@@ -40,17 +40,20 @@ void Tensor::Reshape(const std::vector<int> &shape) { ...@@ -40,17 +40,20 @@ void Tensor::Reshape(const std::vector<int> &shape) {
#endif #endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true, PADDLE_ENFORCE_EQ(input_or_output_,
true,
paddle::platform::errors::PermissionDenied( paddle::platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly")); "Can't reshape the output tensor, it is readonly"));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize(phi::make_ddim(shape)); tensor->Resize(phi::make_ddim(shape));
...@@ -58,17 +61,20 @@ void Tensor::Reshape(const std::vector<int> &shape) { ...@@ -58,17 +61,20 @@ void Tensor::Reshape(const std::vector<int> &shape) {
void Tensor::ReshapeStrings(const size_t &shape) { void Tensor::ReshapeStrings(const size_t &shape) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true, PADDLE_ENFORCE_EQ(input_or_output_,
true,
paddle::platform::errors::PermissionDenied( paddle::platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly")); "Can't reshape the output tensor, it is readonly"));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>(); paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
tensor->resize(shape); tensor->resize(shape);
...@@ -84,7 +90,8 @@ template <typename T> ...@@ -84,7 +90,8 @@ template <typename T>
T *Tensor::mutable_data(PlaceType place) { T *Tensor::mutable_data(PlaceType place) {
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
tensor->numel(), 0, tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const std::vector<int> " "You should call Tensor::Reshape(const std::vector<int> "
"&shape)" "&shape)"
...@@ -97,8 +104,9 @@ T *Tensor::mutable_data(PlaceType place) { ...@@ -97,8 +104,9 @@ T *Tensor::mutable_data(PlaceType place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::CUDAPlace gpu_place(device_); paddle::platform::CUDAPlace gpu_place(device_);
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> phi::Place,
*>(device_contexs_); std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
return dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T)); return dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));
...@@ -179,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -179,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
#endif #endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GE(tensor->numel(), 0, PADDLE_ENFORCE_GE(tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::vector<int> &shape)" "std::vector<int> &shape)"
...@@ -194,14 +203,18 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -194,14 +203,18 @@ void Tensor::CopyFromCpu(const T *data) {
paddle::platform::CUDAPlace gpu_place(device_); paddle::platform::CUDAPlace gpu_place(device_);
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>( phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_); device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
auto *t_data = dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T)); auto *t_data = dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));
paddle::memory::Copy(gpu_place, static_cast<void *>(t_data), paddle::memory::Copy(gpu_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -212,8 +225,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -212,8 +225,11 @@ void Tensor::CopyFromCpu(const T *data) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
paddle::platform::XPUPlace xpu_place(device_); paddle::platform::XPUPlace xpu_place(device_);
auto *t_data = tensor->mutable_data<T>(xpu_place); auto *t_data = tensor->mutable_data<T>(xpu_place);
paddle::memory::Copy(xpu_place, static_cast<void *>(t_data), paddle::memory::Copy(xpu_place,
paddle::platform::CPUPlace(), data, ele_size); static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
...@@ -227,8 +243,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -227,8 +243,11 @@ void Tensor::CopyFromCpu(const T *data) {
auto *t_data = tensor->mutable_data<T>(npu_place); auto *t_data = tensor->mutable_data<T>(npu_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place)); pool.Get(npu_place));
paddle::memory::Copy(npu_place, static_cast<void *>(t_data), paddle::memory::Copy(npu_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -246,8 +265,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -246,8 +265,11 @@ void Tensor::CopyFromCpu(const T *data) {
auto *t_data = tensor->mutable_data<T>(custom_place); auto *t_data = tensor->mutable_data<T>(custom_place);
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place)); pool.Get(custom_place));
paddle::memory::Copy(custom_place, static_cast<void *>(t_data), paddle::memory::Copy(custom_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
...@@ -291,30 +313,33 @@ struct DataTypeInfo<int32_t> { ...@@ -291,30 +313,33 @@ struct DataTypeInfo<int32_t> {
paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { paddle::experimental::DataLayout LayoutConvert(DataLayout layout) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
layout, DataLayout::kNCHW, layout,
DataLayout::kNCHW,
paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); paddle::platform::errors::InvalidArgument("Only NCHW is supported now."));
return paddle::experimental::DataLayout::NCHW; return paddle::experimental::DataLayout::NCHW;
} }
template <typename T> template <typename T>
void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape, void Tensor::ShareExternalData(const T *data,
PlaceType place, DataLayout layout) { const std::vector<int> &shape,
PlaceType place,
DataLayout layout) {
EAGER_GET_TENSOR(paddle::framework::LoDTensor) EAGER_GET_TENSOR(paddle::framework::LoDTensor)
size_t size = size_t size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) * std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T); sizeof(T);
phi::DenseTensorMeta meta(DataTypeInfo<T>().TYPE, phi::make_ddim(shape), phi::DenseTensorMeta meta(
LayoutConvert(layout)); DataTypeInfo<T>().TYPE, phi::make_ddim(shape), LayoutConvert(layout));
if (place == PlaceType::kCPU) { if (place == PlaceType::kCPU) {
phi::DenseTensor dtensor( phi::DenseTensor dtensor(
std::make_shared<phi::Allocation>(const_cast<T *>(data), size, std::make_shared<phi::Allocation>(
paddle::platform::CPUPlace()), const_cast<T *>(data), size, paddle::platform::CPUPlace()),
meta); meta);
*tensor = std::move(dtensor); *tensor = std::move(dtensor);
} else if (place == PlaceType::kGPU) { } else if (place == PlaceType::kGPU) {
phi::DenseTensor dtensor( phi::DenseTensor dtensor(
std::make_shared<phi::Allocation>(const_cast<T *>(data), size, std::make_shared<phi::Allocation>(
paddle::platform::CUDAPlace(device_)), const_cast<T *>(data), size, paddle::platform::CUDAPlace(device_)),
meta); meta);
*tensor = std::move(dtensor); *tensor = std::move(dtensor);
} else { } else {
...@@ -325,7 +350,8 @@ void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape, ...@@ -325,7 +350,8 @@ void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape,
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
EAGER_GET_TENSOR(paddle_infer::Strings); EAGER_GET_TENSOR(paddle_infer::Strings);
PADDLE_ENFORCE_GE(tensor->size(), 0, PADDLE_ENFORCE_GE(tensor->size(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::size_t &shape)function before copying" "std::size_t &shape)function before copying"
...@@ -334,7 +360,9 @@ void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { ...@@ -334,7 +360,9 @@ void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
} }
template <typename T> template <typename T>
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, void Tensor::CopyToCpuImpl(T *data,
void *exec_stream,
CallbackFunc cb,
void *cb_params) const { void *cb_params) const {
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto ele_num = tensor->numel(); auto ele_num = tensor->numel();
...@@ -344,7 +372,8 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -344,7 +372,8 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
paddle::framework::Tensor out; paddle::framework::Tensor out;
auto mem_allocation = auto mem_allocation =
std::make_shared<paddle::memory::allocation::Allocation>( std::make_shared<paddle::memory::allocation::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T), static_cast<void *>(data),
ele_num * sizeof(T),
paddle::platform::CPUPlace()); paddle::platform::CPUPlace());
out.ResetHolder(mem_allocation); out.ResetHolder(mem_allocation);
...@@ -355,7 +384,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -355,7 +384,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
tensor->layout(), tensor->layout(),
paddle::platform::MKLDNNDeviceContext::tls() paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(), .get_cur_paddle_data_layout(),
*tensor, &out, paddle::platform::CPUPlace(), true); *tensor,
&out,
paddle::platform::CPUPlace(),
true);
else else
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else #else
...@@ -373,13 +405,17 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -373,13 +405,17 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto gpu_place = t_place; auto gpu_place = t_place;
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>( phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_); device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), gpu_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); gpu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream()); hipStreamSynchronize(dev_ctx->stream());
#else #else
...@@ -403,7 +439,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -403,7 +439,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto xpu_place = t_place; auto xpu_place = t_place;
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), xpu_place, t_data, static_cast<void *>(data),
xpu_place,
t_data,
ele_num * sizeof(T)); ele_num * sizeof(T));
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -418,8 +456,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -418,8 +456,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place)); pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), npu_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); npu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
paddle::platform::NPUStreamSync(dev_ctx->stream()); paddle::platform::NPUStreamSync(dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -434,8 +475,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -434,8 +475,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place)); pool.Get(custom_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), custom_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); custom_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
// TODO(wangran16): sync_stream // TODO(wangran16): sync_stream
#else #else
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
...@@ -474,22 +518,34 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data); ...@@ -474,22 +518,34 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data); template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::ShareExternalData<float>( template PD_INFER_DECL void Tensor::ShareExternalData<float>(
const float *data, const std::vector<int> &shape, PlaceType place, const float *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>(
const int64_t *data, const std::vector<int> &shape, PlaceType place, const int64_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>(
const int32_t *data, const std::vector<int> &shape, PlaceType place, const int32_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>( template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>(
const uint8_t *data, const std::vector<int> &shape, PlaceType place, const uint8_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>(
const int8_t *data, const std::vector<int> &shape, PlaceType place, const int8_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<float16>( template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const float16 *data, const std::vector<int> &shape, PlaceType place, const float16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
...@@ -566,14 +622,16 @@ Tensor::Tensor(void *scope, const void *device_contexts) ...@@ -566,14 +622,16 @@ Tensor::Tensor(void *scope, const void *device_contexts)
template <typename T> template <typename T>
void *Tensor::FindTensor() const { void *Tensor::FindTensor() const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<T>(); auto *tensor = var->GetMutable<T>();
return tensor; return tensor;
...@@ -602,7 +660,8 @@ std::vector<int> Tensor::shape() const { ...@@ -602,7 +660,8 @@ std::vector<int> Tensor::shape() const {
#endif #endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
tensor_, paddle::platform::errors::PreconditionNotMet( tensor_,
paddle::platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_)); "Not found tensor called %s in the scope", name_));
// mkldnn may does layout transform internally, so need to reorder before // mkldnn may does layout transform internally, so need to reorder before
// return // return
...@@ -668,40 +727,65 @@ void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) { ...@@ -668,40 +727,65 @@ void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
binding_ = binding; binding_ = binding;
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, void Tensor::SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer) {
size_t size, const int64_t *shape, size_t shape_len) { buffer_ = buffer;
return Ort::Value::CreateTensor<float>(memory_info, data, size, shape, }
shape_len);
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
float *data,
size_t size,
const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<float>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int64_t *data,
return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int64_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int32_t *data,
return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int32_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { uint8_t *data,
return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<uint8_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int8_t *data,
return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int8_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { float16 *data,
return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data), size_t size,
size * sizeof(float16), shape, shape_len, const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor(memory_info,
static_cast<void *>(data),
size * sizeof(float16),
shape,
shape_len,
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
} }
...@@ -712,15 +796,16 @@ void Tensor::ORTCopyFromCpu(const T *data) { ...@@ -712,15 +796,16 @@ void Tensor::ORTCopyFromCpu(const T *data) {
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"input tensor [%s] no binding ptr", name_)); "input tensor [%s] no binding ptr", name_));
const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, Ort::MemoryInfo memory_info(
OrtMemTypeDefault); device_name, OrtDeviceAllocator, device_, OrtMemTypeDefault);
size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, size_t size = std::accumulate(
std::multiplies<size_t>()); begin(shape_), end(shape_), 1UL, std::multiplies<size_t>());
auto buffer = buffer_.lock();
size_t buffer_size = size * sizeof(T); size_t buffer_size = size * sizeof(T);
if (buffer_size > buffer_.size()) { if (buffer_size > buffer->size()) {
buffer_.resize(buffer_size); buffer->resize(buffer_size);
} }
std::memcpy(static_cast<void *>(buffer_.data()), data, buffer_size); std::memcpy(static_cast<void *>(buffer->data()), data, buffer_size);
auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
if (std::is_same<T, float>::value) { if (std::is_same<T, float>::value) {
...@@ -737,18 +822,18 @@ void Tensor::ORTCopyFromCpu(const T *data) { ...@@ -737,18 +822,18 @@ void Tensor::ORTCopyFromCpu(const T *data) {
onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
} else if (std::is_same<T, float16>::value) { } else if (std::is_same<T, float16>::value) {
onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
} } else {
if (onnx_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Found undefined data type for onnxruntime, only supports " "Found undefined data type for onnxruntime, only supports "
"float16/float32/float64/int8/uint8/int32/int64.")); "float16/float32/float64/int8/uint8/int32/int64."));
} }
auto ort_value = auto ort_value = Ort::Value::CreateTensor(memory_info,
Ort::Value::CreateTensor(memory_info, buffer_.data(), buffer_size, buffer->data(),
shape_.data(), shape_.size(), onnx_dtype); buffer_size,
shape_.data(),
shape_.size(),
onnx_dtype);
binding->BindInput(name_.c_str(), ort_value); binding->BindInput(name_.c_str(), ort_value);
} }
...@@ -793,21 +878,24 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -793,21 +878,24 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
cudaStream_t stream) { cudaStream_t stream) {
if (t->tensor_ == nullptr) { if (t->tensor_ == nullptr) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
t->name_.empty(), false, t->name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(t->scope_); auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
auto *var = scope->FindVar(t->name_); auto *var = scope->FindVar(t->name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", t->name_)); "No tensor called [%s] in the runtime scope", t->name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
t->tensor_ = tensor; t->tensor_ = tensor;
} }
auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_); auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
PADDLE_ENFORCE_GE(tensor->numel(), 0, PADDLE_ENFORCE_GE(tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::vector<int> &shape)" "std::vector<int> &shape)"
...@@ -820,8 +908,12 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -820,8 +908,12 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::CUDAPlace gpu_place(t->device_); paddle::platform::CUDAPlace gpu_place(t->device_);
auto *t_data = tensor->mutable_data<T>(gpu_place); auto *t_data = tensor->mutable_data<T>(gpu_place);
paddle::memory::Copy(gpu_place, static_cast<void *>(t_data), paddle::memory::Copy(gpu_place,
paddle::platform::CPUPlace(), data, ele_size, stream); static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
stream);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with CUDA place because paddle is not compiled " "Can not create tensor with CUDA place because paddle is not compiled "
...@@ -834,18 +926,21 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -834,18 +926,21 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
} }
template <typename T> template <typename T>
void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t,
T *data,
cudaStream_t stream) { cudaStream_t stream) {
if (t->tensor_ == nullptr) { if (t->tensor_ == nullptr) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
t->name_.empty(), false, t->name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(t->scope_); auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
auto *var = scope->FindVar(t->name_); auto *var = scope->FindVar(t->name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", t->name_)); "No tensor called [%s] in the runtime scope", t->name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
t->tensor_ = tensor; t->tensor_ = tensor;
...@@ -859,7 +954,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -859,7 +954,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
paddle::framework::Tensor out; paddle::framework::Tensor out;
auto mem_allocation = auto mem_allocation =
std::make_shared<paddle::memory::allocation::Allocation>( std::make_shared<paddle::memory::allocation::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T), static_cast<void *>(data),
ele_num * sizeof(T),
paddle::platform::CPUPlace()); paddle::platform::CPUPlace());
out.ResetHolder(mem_allocation); out.ResetHolder(mem_allocation);
...@@ -870,7 +966,10 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -870,7 +966,10 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
tensor->layout(), tensor->layout(),
paddle::platform::MKLDNNDeviceContext::tls() paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(), .get_cur_paddle_data_layout(),
*tensor, &out, paddle::platform::CPUPlace(), true); *tensor,
&out,
paddle::platform::CPUPlace(),
true);
else else
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else #else
...@@ -879,8 +978,11 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -879,8 +978,11 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
} else if (t->place_ == PlaceType::kGPU) { } else if (t->place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), t_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), stream); t_place,
t_data,
ele_num * sizeof(T),
stream);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with CUDA place because paddle is not compiled " "Can not create tensor with CUDA place because paddle is not compiled "
......
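Most of this file is mechanical reformatting of `PADDLE_ENFORCE_*` and `paddle::memory::Copy` argument lists; the substantive change is in `Tensor::ORTCopyFromCpu`, where the tensor no longer owns its staging buffer as a plain `std::vector<int8_t>` but holds a `std::weak_ptr` to a buffer owned by the predictor and locks it before copying. A minimal sketch of that pattern follows, with simplified, hypothetical type names (`OrtTensorSketch` is not the real `Tensor` class) and the `Ort::Value` creation omitted.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

// Simplified stand-in for the inference Tensor; only the ORT staging-buffer
// handling is shown.
struct OrtTensorSketch {
  std::weak_ptr<std::vector<int8_t>> buffer_;  // owned by the predictor

  template <typename T>
  void CopyFromCpu(const T* data, size_t numel) {
    auto buffer = buffer_.lock();  // take shared ownership for the copy
    if (!buffer) return;           // predictor (and its buffers) already gone
    const size_t bytes = numel * sizeof(T);
    if (bytes > buffer->size()) buffer->resize(bytes);
    std::memcpy(buffer->data(), data, bytes);
    // ... an Ort::Value would be created over buffer->data() and bound here ...
  }
};
```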
...@@ -71,13 +71,16 @@ bool CheckConvertToONNX(const AnalysisConfig &config) { ...@@ -71,13 +71,16 @@ bool CheckConvertToONNX(const AnalysisConfig &config) {
} else if (config.prog_file().empty() || config.params_file().empty()) { } else if (config.prog_file().empty() || config.params_file().empty()) {
LOG(ERROR) << string::Sprintf( LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s' or params path '%s'.", "not valid model path '%s' or program path '%s' or params path '%s'.",
config.model_dir(), config.prog_file(), config.params_file()); config.model_dir(),
config.prog_file(),
config.params_file());
return false; return false;
} }
if (config.model_from_memory()) { if (config.model_from_memory()) {
return paddle2onnx::IsExportable( return paddle2onnx::IsExportable(config.prog_file().data(),
config.prog_file().data(), config.prog_file().size(), config.prog_file().size(),
config.params_file().data(), config.params_file().size()); config.params_file().data(),
config.params_file().size());
} else { } else {
return paddle2onnx::IsExportable(config.prog_file().c_str(), return paddle2onnx::IsExportable(config.prog_file().c_str(),
config.params_file().c_str()); config.params_file().c_str());
...@@ -98,12 +101,17 @@ bool ONNXRuntimePredictor::Init() { ...@@ -98,12 +101,17 @@ bool ONNXRuntimePredictor::Init() {
char *onnx_proto = nullptr; char *onnx_proto = nullptr;
int out_size; int out_size;
if (config_.model_from_memory()) { if (config_.model_from_memory()) {
paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), paddle2onnx::Export(config_.prog_file().data(),
config_.prog_file().size(),
config_.params_file().data(), config_.params_file().data(),
config_.params_file().size(), &onnx_proto, &out_size); config_.params_file().size(),
&onnx_proto,
&out_size);
} else { } else {
paddle2onnx::Export(config_.prog_file().c_str(), paddle2onnx::Export(config_.prog_file().c_str(),
config_.params_file().c_str(), &onnx_proto, &out_size); config_.params_file().c_str(),
&onnx_proto,
&out_size);
} }
Ort::SessionOptions session_options; Ort::SessionOptions session_options;
...@@ -134,8 +142,8 @@ bool ONNXRuntimePredictor::Init() { ...@@ -134,8 +142,8 @@ bool ONNXRuntimePredictor::Init() {
session_ = {env_, onnx_proto, static_cast<size_t>(out_size), session_options}; session_ = {env_, onnx_proto, static_cast<size_t>(out_size), session_options};
binding_ = std::make_shared<Ort::IoBinding>(session_); binding_ = std::make_shared<Ort::IoBinding>(session_);
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo memory_info(
place_.GetDeviceId(), OrtMemTypeDefault); device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault);
Ort::Allocator allocator(session_, memory_info); Ort::Allocator allocator(session_, memory_info);
size_t n_inputs = session_.GetInputCount(); size_t n_inputs = session_.GetInputCount();
...@@ -160,8 +168,10 @@ bool ONNXRuntimePredictor::Init() { ...@@ -160,8 +168,10 @@ bool ONNXRuntimePredictor::Init() {
type_info.GetTensorTypeAndShapeInfo().GetElementType(); type_info.GetTensorTypeAndShapeInfo().GetElementType();
output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo out_memory_info(device_name,
place_.GetDeviceId(), OrtMemTypeDefault); OrtDeviceAllocator,
place_.GetDeviceId(),
OrtMemTypeDefault);
binding_->BindOutput(output_name, out_memory_info); binding_->BindOutput(output_name, out_memory_info);
allocator.Free(output_name); allocator.Free(output_name);
...@@ -181,7 +191,8 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>( ...@@ -181,7 +191,8 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
} }
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
config.is_valid(), true, config.is_valid(),
true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Note: Each config can only be used for one predictor.")); "Note: Each config can only be used for one predictor."));
...@@ -238,7 +249,8 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, ...@@ -238,7 +249,8 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor( std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
const std::string &name) { const std::string &name) {
PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, PADDLE_ENFORCE_EQ(FindONNXDesc(name, true),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The in variable named %s is not found in the " "The in variable named %s is not found in the "
"ONNXPredictor.", "ONNXPredictor.",
...@@ -254,12 +266,21 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor( ...@@ -254,12 +266,21 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
} }
res->SetOrtMark(true); res->SetOrtMark(true);
res->SetOrtBinding(binding_); res->SetOrtBinding(binding_);
auto iter = input_buffers_.find(name);
if (iter == input_buffers_.end()) {
std::vector<int8_t> i_vector;
input_buffers_[name] = std::make_shared<std::vector<int8_t>>(i_vector);
res->SetOrtBuffer(input_buffers_[name]);
} else {
res->SetOrtBuffer(iter->second);
}
return res; return res;
} }
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor( std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
const std::string &name) { const std::string &name) {
PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, PADDLE_ENFORCE_EQ(FindONNXDesc(name, false),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The out variable named %s is not found in the " "The out variable named %s is not found in the "
"ONNXPredictor.", "ONNXPredictor.",
...@@ -296,8 +317,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { ...@@ -296,8 +317,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() {
try { try {
const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
for (auto output : output_desc_) { for (auto output : output_desc_) {
Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo out_memory_info(device_name,
place_.GetDeviceId(), OrtMemTypeDefault); OrtDeviceAllocator,
place_.GetDeviceId(),
OrtMemTypeDefault);
binding_->BindOutput(output.name.c_str(), out_memory_info); binding_->BindOutput(output.name.c_str(), out_memory_info);
} }
session_.Run({}, *(binding_.get())); session_.Run({}, *(binding_.get()));
...@@ -330,8 +353,9 @@ const void *ONNXRuntimePredictor::GetDeviceContexts() const { ...@@ -330,8 +353,9 @@ const void *ONNXRuntimePredictor::GetDeviceContexts() const {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
const auto &dev_ctxs = pool.device_contexts(); const auto &dev_ctxs = pool.device_contexts();
return &const_cast<std::map< return &const_cast<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> &>( std::map<phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> &>(
dev_ctxs); dev_ctxs);
} }
......
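`GetInputTensor` is where the multiple-input bug is actually fixed: each input name now gets its own staging buffer from the new `input_buffers_` map, so binding a second input no longer overwrites the data of the first. The sketch below shows that per-name lookup with hypothetical names (`OrtPredictorSketch`, `BufferFor`); it is an illustration of the pattern, not the full `ONNXRuntimePredictor` class.

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for the predictor side of the fix.
class OrtPredictorSketch {
 public:
  // Hand out one staging buffer per input name and reuse it across runs.
  std::shared_ptr<std::vector<int8_t>> BufferFor(const std::string& name) {
    auto iter = input_buffers_.find(name);
    if (iter != input_buffers_.end()) return iter->second;
    auto buffer = std::make_shared<std::vector<int8_t>>();
    input_buffers_[name] = buffer;  // predictor keeps every buffer alive
    return buffer;
  }

 private:
  // One buffer per input instead of a single buffer shared by all inputs.
  std::map<std::string, std::shared_ptr<std::vector<int8_t>>> input_buffers_;
};
```

Keeping the `shared_ptr` owners in the predictor and handing the tensors only `weak_ptr`s ties the buffers' lifetime to the predictor rather than to individual `ZeroCopyTensor` objects, which matches the header change from `std::vector<int8_t> buffer_` to `std::weak_ptr<std::vector<int8_t>> buffer_` further below.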
...@@ -202,6 +202,7 @@ class ONNXRuntimePredictor : public PaddlePredictor { ...@@ -202,6 +202,7 @@ class ONNXRuntimePredictor : public PaddlePredictor {
platform::Place place_; platform::Place place_;
std::vector<ONNXDesc> input_desc_; std::vector<ONNXDesc> input_desc_;
std::vector<ONNXDesc> output_desc_; std::vector<ONNXDesc> output_desc_;
std::map<std::string, std::shared_ptr<std::vector<int8_t>>> input_buffers_;
int predictor_id_; int predictor_id_;
// Some more detailed tests, they are made the friends of the predictor, so that // Some more detailed tests, they are made the friends of the predictor, so that
......
...@@ -110,7 +110,8 @@ class PD_INFER_DECL Tensor { ...@@ -110,7 +110,8 @@ class PD_INFER_DECL Tensor {
/// \param place The place of data. /// \param place The place of data.
/// \param layout The layout of data. Only NCHW is supported now. /// \param layout The layout of data. Only NCHW is supported now.
template <typename T> template <typename T>
void ShareExternalData(const T* data, const std::vector<int>& shape, void ShareExternalData(const T* data,
const std::vector<int>& shape,
PlaceType place, PlaceType place,
DataLayout layout = DataLayout::kNCHW); DataLayout layout = DataLayout::kNCHW);
...@@ -171,7 +172,9 @@ class PD_INFER_DECL Tensor { ...@@ -171,7 +172,9 @@ class PD_INFER_DECL Tensor {
void SetName(const std::string& name); void SetName(const std::string& name);
template <typename T> template <typename T>
void CopyToCpuImpl(T* data, void* stream = nullptr, CallbackFunc cb = nullptr, void CopyToCpuImpl(T* data,
void* stream = nullptr,
CallbackFunc cb = nullptr,
void* cb_params = nullptr) const; void* cb_params = nullptr) const;
std::string name_; std::string name_;
...@@ -188,7 +191,7 @@ class PD_INFER_DECL Tensor { ...@@ -188,7 +191,7 @@ class PD_INFER_DECL Tensor {
#ifdef PADDLE_WITH_ONNXRUNTIME #ifdef PADDLE_WITH_ONNXRUNTIME
bool is_ort_tensor_{false}; bool is_ort_tensor_{false};
std::vector<int64_t> shape_; std::vector<int64_t> shape_;
std::vector<int8_t> buffer_; std::weak_ptr<std::vector<int8_t>> buffer_;
std::weak_ptr<Ort::IoBinding> binding_; std::weak_ptr<Ort::IoBinding> binding_;
int idx_{-1}; int idx_{-1};
...@@ -196,6 +199,8 @@ class PD_INFER_DECL Tensor { ...@@ -196,6 +199,8 @@ class PD_INFER_DECL Tensor {
void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding); void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
void SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer);
template <typename T> template <typename T>
void ORTCopyFromCpu(const T* data); void ORTCopyFromCpu(const T* data);
......
...@@ -384,12 +384,12 @@ if(WITH_PYTHON) ...@@ -384,12 +384,12 @@ if(WITH_PYTHON)
set(PADDLE2ONNX_PYBIND_OUT set(PADDLE2ONNX_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib) ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib)
set(ONNXRUNTIME_PYBIND_OUT set(ONNXRUNTIME_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib) ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.1.10.0.dylib)
else() else()
set(PADDLE2ONNX_PYBIND_OUT set(PADDLE2ONNX_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so) ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so)
set(ONNXRUNTIME_PYBIND_OUT set(ONNXRUNTIME_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so) ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so.1.10.0)
endif() endif()
add_custom_command( add_custom_command(
......