Unverified commit 10fd4a95, authored by Yuanle Liu, committed by GitHub

[Paddle Inference] Predictor support paddle::Tensor (#50445)
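A minimal caller-side sketch of the API this commit adds (the wrapper function, the include path, and how the input tensors are produced are illustrative assumptions, not part of the patch):

#include <string>
#include <vector>
#include "paddle_inference_api.h"  // adjust to the install layout, e.g. paddle/include/...

// Run inference with paddle::Tensor inputs/outputs via the overload added here.
// Naming each input after a feed variable lets the predictor match tensors by
// name; unnamed tensors fall back to positional matching.
std::vector<paddle::Tensor> RunWithPaddleTensor(
    paddle_infer::Predictor *predictor,
    std::vector<paddle::Tensor> inputs) {  // inputs prepared by the caller
  auto input_names = predictor->GetInputNames();
  for (size_t i = 0; i < inputs.size() && i < input_names.size(); ++i) {
    inputs[i].set_name(input_names[i]);
  }
  std::vector<paddle::Tensor> outputs;
  predictor->Run(inputs, &outputs);  // new overload: paddle::Tensor in and out
  return outputs;
}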

Parent 259b0aad
@@ -17,24 +17,21 @@ set(PADDLE_INFERENCE_INSTALL_DIR
function(phi_header_path_compat TARGET_PATH)
message(STATUS "phi header path compat processing: ${TARGET_PATH}")
-string(FIND ${TARGET_PATH} "experimental" pos)
-if(pos GREATER 1)
file(GLOB HEADERS "${TARGET_PATH}/*" "*.h")
foreach(header ${HEADERS})
if(${header} MATCHES ".*.h$")
file(READ ${header} HEADER_CONTENT)
string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/"
HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/platform/"
"paddle/include/experimental/phi/" HEADER_CONTENT
"${HEADER_CONTENT}")
string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/"
HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "phi header path compat processing complete: ${header}")
endif()
endforeach()
-endif()
endfunction()
phi_header_path_compat(
@@ -51,6 +48,7 @@ phi_header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
phi_header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core)
+phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/)
# In order to be compatible with the original behavior, the header file name needs to be changed
file(RENAME
...
@@ -95,7 +95,7 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope,
PADDLE_ENFORCE_EQ(var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
-"Only support lod tensor in GetVariableTensor now."));
+"Only support DenseTensor in GetVariableTensor now."));
return *var->GetMutable<phi::DenseTensor>();
}
...
@@ -155,11 +155,10 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) {
return phi::Backend::CPU;
}
}
-}  // namespace
-bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
+bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
phi::DenseTensor *t,
const platform::Place &place) {
framework::DDim ddim = phi::make_ddim(pt.shape);
void *input_ptr;
if (pt.dtype == PaddleDType::INT64) {
@@ -270,6 +269,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
t->set_lod(lod);
return true;
}
+}  // namespace
bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope> &parent_scope,
@@ -919,6 +919,17 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
#endif
}
void AnalysisPredictor::MkldnnPreSet(
const std::vector<paddle::Tensor> &inputs) {
#ifdef PADDLE_WITH_MKLDNN
std::vector<std::vector<int>> inputs_shape;
for (size_t i = 0; i < inputs.size(); ++i) {
inputs_shape.emplace_back(phi::vectorize<int>(inputs[i].dims()));
}
MkldnnPreSet(inputs_shape);
#endif
}
void AnalysisPredictor::MkldnnPreSet(
const std::vector<std::vector<int>> &inputs_shape) {
#ifdef PADDLE_WITH_MKLDNN
@@ -1033,6 +1044,70 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
return true;
}
bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
std::vector<paddle::Tensor> *outputs) {
inference::DisplayMemoryInfo(place_, "before run");
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
#ifdef PADDLE_WITH_MKLDNN
if (config_.use_mkldnn_) MkldnnPreSet(inputs);
#endif
VLOG(3) << "predict start";
// set feed variable
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
PADDLE_ENFORCE_NOT_NULL(
scope,
platform::errors::PreconditionNotMet("The scope should not be nullptr."));
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
return false;
}
#ifdef PADDLE_WITH_TENSORRT
if (config_.tensorrt_engine_enabled()) {
inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
predictor_id_;
VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
<< inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
}
#endif
// Run the inference program
// if share variables, we need not create variables
executor_->Run();
inference::DisplayMemoryInfo(place_, "after run");
// get fetch variable
if (!GetFetch(outputs, scope)) {
LOG(ERROR) << "fail to get fetches";
return false;
}
// All the containers in the scope will be held in inference, but the
// operators assume that the container will be reset after each batch.
// Here is a bugfix: collect all the container variables, and reset them to a
// bool; the next time, the operator will call MutableData and construct a new
// container again, so that the container will be empty for each batch.
if (sub_scope_) {
tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
}
tensor_array_batch_cleaner_.ResetNoTensorVars();
// recover the cpu_math_library_num_threads to 1, in order to avoid thread
// conflict when integrating it into deployment service.
paddle::platform::SetNumThreads(1);
#ifdef PADDLE_WITH_MKLDNN
if (config_.use_mkldnn_) MkldnnPostReset();
#endif
#if defined(PADDLE_WITH_MKLML)
// Frees unused memory allocated by the Intel® MKL Memory Allocator to
// avoid memory leak. See:
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
platform::dynload::MKL_Free_Buffers();
#endif
return true;
}
bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed";
@@ -1047,7 +1122,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
for (size_t i = 0; i < inputs.size(); ++i) {
phi::DenseTensor *input = &feed_tensors_[i];
-if (!PaddleTensorToLoDTensor(inputs[i], input, place_)) {
+if (!PaddleTensorToDenseTensor(inputs[i], input, place_)) {
return false;
}
int idx = -1;
@@ -1061,7 +1136,41 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
} else {
idx = PADDLE_GET_CONST(int, feeds_[i]->GetAttr("col"));
}
-framework::SetFeedVariable(scope, *input, "feed", idx);
+framework::SetFeedVariable(scope, *input, framework::kFeedOpType, idx);
}
return true;
}
bool AnalysisPredictor::SetFeed(const std::vector<paddle::Tensor> &inputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed";
PADDLE_ENFORCE_EQ(inputs.size(),
feeds_.size(),
platform::errors::InvalidArgument(
"wrong feed input size, need %d but get %d.",
feeds_.size(),
inputs.size()));
for (size_t i = 0; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i].initialized(),
true,
paddle::platform::errors::InvalidArgument(
"The input Tensor expected to be initialized."));
}
if (std::all_of(inputs.cbegin(), inputs.cend(), [&](const paddle::Tensor &t) {
return !t.name().empty() && feed_names_.count(t.name());
})) {
for (size_t i = 0; i < inputs.size(); ++i) {
auto &t = framework::GetVariableTensor(*scope, inputs[i].name());
t.ShareDataWith(
*std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl()));
}
} else {
for (size_t i = 0; i < inputs.size(); ++i) {
auto &t = framework::GetVariableTensor(*scope, idx2feeds_[i]);
t.ShareDataWith(
*std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl()));
}
}
return true;
}
@@ -1100,7 +1209,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
idx,
i));
framework::FetchType &fetch_var =
-framework::GetFetchVariable(*scope, "fetch", idx);
+framework::GetFetchVariable(*scope, framework::kFetchOpType, idx);
auto &fetch = PADDLE_GET(phi::DenseTensor, fetch_var);
auto type = framework::TransToProtoVarType(fetch.dtype());
auto output = &(outputs->at(i));
@@ -1125,6 +1234,19 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
return true;
}
bool AnalysisPredictor::GetFetch(std::vector<paddle::Tensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetches_.size());
for (size_t i = 0; i < fetches_.size(); ++i) {
auto const &name = idx2fetches_[i];
auto &t = framework::GetVariableTensor(*scope, name);
(*outputs)[i] =
std::move(paddle::Tensor(std::make_shared<phi::DenseTensor>(t), name));
}
return true;
}
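GetFetch names each returned tensor after its fetch variable (idx2fetches_), so a caller can key results by name instead of position; a small sketch of that pattern (the helper is an assumption, not part of the patch):

// Group Run()/GetFetch() outputs by the fetch-variable names assigned above.
#include <map>
#include <string>
#include <vector>
#include "paddle/phi/api/include/tensor.h"

std::map<std::string, paddle::Tensor> OutputsByName(
    const std::vector<paddle::Tensor> &outputs) {
  std::map<std::string, paddle::Tensor> named;
  for (const auto &out : outputs) {
    named.emplace(out.name(), out);  // name() was set from idx2fetches_[i]
  }
  return named;
}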
void AnalysisPredictor::PrepareArgument() {
VLOG(3) << "AnalysisPredictor::PrepareArgument";
// Init std::unique_ptr argument_.
@@ -1579,7 +1701,7 @@ void AnalysisPredictor::PrepareFeedFetch() {
"The sub_scope should not be nullptr."));
CreateFeedFetchVar(sub_scope_);
for (auto *op : inference_program_->Block(0).AllOps()) {
-if (op->Type() == "feed") {
+if (op->Type() == framework::kFeedOpType) {
int idx = PADDLE_GET_CONST(int, op->GetAttr("col"));
if (feeds_.size() <= static_cast<size_t>(idx)) {
feeds_.resize(idx + 1);
@@ -1587,7 +1709,7 @@ void AnalysisPredictor::PrepareFeedFetch() {
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
-} else if (op->Type() == "fetch") {
+} else if (op->Type() == framework::kFetchOpType) {
int idx = PADDLE_GET_CONST(int, op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
@@ -1602,9 +1724,9 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
PADDLE_ENFORCE_NOT_NULL(
scope,
platform::errors::InvalidArgument("The scope should not be nullptr."));
-auto *var = scope->Var("feed");
+auto *var = scope->Var(framework::kFeedOpType);
var->GetMutable<framework::FeedList>();
-var = scope->Var("fetch");
+var = scope->Var(framework::kFetchOpType);
var->GetMutable<framework::FetchList>();
}
@@ -2186,7 +2308,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
const std::string name = var->Name();
auto *variable = executor_->GetScope()->FindVar(name);
if (variable != nullptr && variable->IsType<phi::DenseTensor>() &&
-name != "feed" && name != "fetch") {
+name != framework::kFeedOpType && name != framework::kFetchOpType) {
VLOG(3) << "Clear Intermediate Tensor: " << name;
auto *t = variable->GetMutable<phi::DenseTensor>();
t->clear();
@@ -2653,6 +2775,11 @@ std::map<std::string, DataType> Predictor::GetOutputTypes() {
bool Predictor::Run() { return predictor_->ZeroCopyRun(); }
bool Predictor::Run(const std::vector<paddle::Tensor> &inputs,
std::vector<paddle::Tensor> *outputs) {
return predictor_->Run(inputs, outputs);
}
std::unique_ptr<Predictor> Predictor::Clone(void *stream) {
auto analysis_pred = predictor_->Clone(stream);
std::unique_ptr<Predictor> pred(new Predictor(std::move(analysis_pred)));
...
@@ -31,15 +31,14 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
-#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif
namespace paddle_infer {
-using float16 = paddle::platform::float16;
namespace experimental {
class InternalUtils;
};
@@ -150,6 +149,16 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
///
/// \brief Run the prediction engine (Recommended).
///
/// \param[in] inputs input tensors
/// \param[out] outputs output tensors
/// \return Whether the function executed successfully
///
bool Run(const std::vector<paddle::Tensor> &inputs,
std::vector<paddle::Tensor> *outputs) override;
///
/// \brief Get the input names
///
@@ -378,6 +387,17 @@ class AnalysisPredictor : public PaddlePredictor {
///
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
///
/// \brief Prepare input data, only used in Run()
///
/// \param[in] inputs input tensors
/// \param[in] scope the scope used by predictor
/// \return Whether the function executed successfully
///
bool SetFeed(const std::vector<paddle::Tensor> &inputs,
framework::Scope *scope);
///
/// \brief Get the output data, only used in Run()
///
@@ -387,6 +407,16 @@ class AnalysisPredictor : public PaddlePredictor {
///
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
///
/// \brief Get the output data, only used in Run()
///
/// \param[out] outputs output tensors
/// \param[in] scope the scope used by predictor
/// \return Whether the function executed successfully
///
bool GetFetch(std::vector<paddle::Tensor> *outputs, framework::Scope *scope);
///
/// \brief Get the output data, only used in GetFetch()
///
@@ -404,6 +434,14 @@ class AnalysisPredictor : public PaddlePredictor {
/// \param[in] inputs tensors
///
void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
///
/// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
///
/// Used in AnalysisPredictor::Run().
///
/// \param[in] inputs tensors
///
void MkldnnPreSet(const std::vector<paddle::Tensor> &inputs);
///
/// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
...
@@ -83,7 +83,7 @@ else()
if(WITH_MKL)
set(FLAG_OPENMP "-fopenmp")
endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ${FLAG_OPENMP}")
endif()
if(WITH_GPU)
...
@@ -221,6 +221,16 @@ class PD_INFER_DECL PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/// \brief This interface takes input and runs the network (Recommended).
/// \param[in] inputs A list of Tensors as the input to the network.
/// \param[out] outputs Pointer to the tensor list, which holds the output
/// Tensors.
/// \return Whether the run is successful
virtual bool Run(const std::vector<paddle::Tensor>& inputs,
std::vector<paddle::Tensor>* outputs) {
return false;
}
/// \brief Used to get the name of the network input.
/// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
/// \return Input tensor names.
...
@@ -128,6 +128,17 @@ class PD_INFER_DECL Predictor {
///
bool Run();
///
/// \brief Run the prediction engine (Recommended)
///
/// \param[in] inputs A list of Tensors as the input to the network.
/// \param[out] outputs Pointer to the tensor list, which holds the output
/// Tensors.
///
/// \return Whether the run is successful
bool Run(const std::vector<paddle::Tensor>& inputs,
std::vector<paddle::Tensor>* outputs);
///
/// \brief Get the output names
///
...
@@ -21,6 +21,8 @@
#include "paddle_infer_declare.h"  // NOLINT
+#include "paddle/phi/api/include/tensor.h"  // expose paddle::Tensor
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_c_api.h"    // NOLINT
#include "onnxruntime_cxx_api.h"  // NOLINT
...
@@ -22,11 +22,6 @@
namespace paddle {
namespace jit {
-static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t);
-static bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
-DenseTensor *t,
-const platform::Place &place);
PredictorEngine::PredictorEngine(
const std::shared_ptr<FunctionInfo> &info,
const std::shared_ptr<VariableMap> &params_dict,
@@ -52,6 +47,7 @@ PredictorEngine::PredictorEngine(
config.SetSkipLoadParams(true);
config.SetApplyOptim(true);
config.SwitchIrOptim(true);
+config.SwitchUseFeedFetchOps(false);
predictor_.reset(new AnalysisPredictor(config));
@@ -78,135 +74,15 @@ std::unique_ptr<BaseEngine> PredictorEngine::Clone(void *stream) {
std::vector<Tensor> PredictorEngine::operator()(
const std::vector<Tensor> &inputs) {
-auto dense_tensors = utils::ToDenseTensors(inputs);
-return utils::ToTensors(this->operator()(dense_tensors));
+std::vector<Tensor> outputs;
+predictor_->Run(inputs, &outputs);
+return outputs;
}
std::vector<DenseTensor> PredictorEngine::operator()(
const std::vector<DenseTensor> &inputs) {
-std::vector<PaddleTensor> pt_inputs;
-std::vector<PaddleTensor> pt_outputs;
-for (auto &t : inputs) {
-auto non_const_t = const_cast<DenseTensor *>(&t);
-pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t));
-}
-predictor_->Run(pt_inputs, &pt_outputs);
-std::vector<DenseTensor> outputs;
-for (auto &pt : pt_outputs) {
-DenseTensor t;
-PaddleTensorToDenseTensor(pt, &t, place_);
-outputs.emplace_back(t);
-}
-return outputs;
+return utils::ToDenseTensors(this->operator()(utils::ToTensors(inputs)));
}
-static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) {
-PaddleTensor pt;
-switch (framework::TransToProtoVarType(t->dtype())) {
case framework::proto::VarType::INT32: {
pt.data.Reset(t->data(), t->numel() * sizeof(int32_t));
pt.dtype = PaddleDType::INT32;
} break;
case framework::proto::VarType::INT64: {
pt.data.Reset(t->data(), t->numel() * sizeof(int64_t));
pt.dtype = PaddleDType::INT64;
} break;
case framework::proto::VarType::FP32: {
pt.data.Reset(t->data(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32;
} break;
default:
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported tensor date type. Now "
"only supports INT64, FP32, INT32."));
}
pt.shape = phi::vectorize<int>(t->dims());
return pt;
}
static bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
DenseTensor *t,
const platform::Place &place) {
framework::DDim ddim = phi::make_ddim(pt.shape);
void *input_ptr;
switch (pt.dtype) {
case PaddleDType::INT64:
input_ptr = t->mutable_data<int64_t>(ddim, place);
break;
case PaddleDType::FLOAT32:
input_ptr = t->mutable_data<float>(ddim, place);
break;
case PaddleDType::INT32:
input_ptr = t->mutable_data<int32_t>(ddim, place);
break;
case PaddleDType::FLOAT16:
input_ptr = t->mutable_data<float16>(ddim, place);
break;
default:
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
}
PADDLE_ENFORCE_NOT_NULL(
input_ptr,
paddle::platform::errors::Fatal(
"Cannot convert to LoDTensor because LoDTensor creation failed."));
PADDLE_ENFORCE_NOT_NULL(
pt.data.data(),
paddle::platform::errors::InvalidArgument(
"The data contained in the input PaddleTensor is illegal."));
if (platform::is_cpu_place(place)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(
static_cast<void *>(input_ptr), pt.data.data(), pt.data.length());
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
std::memcpy(
static_cast<void *>(input_ptr), pt.data.data(), pt.data.length());
#else
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with WITH_IPU, should not reach here."));
#endif
} else if (platform::is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(platform::is_xpu_place(place),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place));
auto dst_gpu_place = place;
memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr),
platform::CPUPlace(),
pt.data.data(),
pt.data.length(),
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with CUDA, should not reach here."));
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = place;
memory::Copy(dst_xpu_place,
static_cast<void *>(input_ptr),
platform::CPUPlace(),
pt.data.data(),
pt.data.length());
#else
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
}
return true;
} }
} // namespace jit
...
@@ -770,7 +770,11 @@ PyObject* ToPyObject(const std::vector<std::vector<size_t>>& value) {
PyObject* ToPyObject(const std::vector<paddle::Tensor>& value,
bool return_py_none_if_not_initialize) {
+// NOTE(liuyuanle): I encountered a bug (access violation) on Windows. See
+// https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin
+PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* result = PyList_New((Py_ssize_t)value.size());
+PyGILState_Release(gstate);
for (size_t i = 0; i < value.size(); i++) {
if (!value[i].initialized() && return_py_none_if_not_initialize) {
...
@@ -65,7 +65,7 @@ constexpr int NPY_UINT16_ = 4;
// paddle::platform::float16 as numpy.float16.
// Ref: https://github.com/pybind/pybind11/issues/1776
template <>
-struct npy_format_descriptor<paddle_infer::float16> {
+struct npy_format_descriptor<phi::dtype::float16> {
static py::dtype dtype() {
handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_);
return reinterpret_borrow<py::dtype>(ptr);
@@ -180,7 +180,7 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) {
dt = py::dtype::of<float>();
break;
case PaddleDType::FLOAT16:
-dt = py::dtype::of<paddle_infer::float16>();
+dt = py::dtype::of<phi::dtype::float16>();
break;
case PaddleDType::UINT8:
dt = py::dtype::of<uint8_t>();
@@ -264,7 +264,7 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor,  // NOLINT
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::FLOAT16) {
tensor.ShareExternalData(
-static_cast<paddle::platform::float16 *>(input_tensor.data()),
+static_cast<phi::dtype::float16 *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::INT32) {
@@ -353,7 +353,7 @@ size_t PaddleGetDTypeSize(PaddleDType dt) {
size = sizeof(float);
break;
case PaddleDType::FLOAT16:
-size = sizeof(paddle_infer::float16);
+size = sizeof(phi::dtype::float16);
break;
case PaddleDType::INT8:
size = sizeof(int8_t);
@@ -392,8 +392,8 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) {  // NOLINT
tensor.copy_to_cpu<float>(static_cast<float *>(array.mutable_data()));
break;
case PaddleDType::FLOAT16:
-tensor.copy_to_cpu<paddle::platform::float16>(
-static_cast<paddle::platform::float16 *>(array.mutable_data()));
+tensor.copy_to_cpu<phi::dtype::float16>(
+static_cast<phi::dtype::float16 *>(array.mutable_data()));
break;
case PaddleDType::UINT8:
tensor.copy_to_cpu<uint8_t>(static_cast<uint8_t *>(array.mutable_data()));
@@ -432,8 +432,8 @@ py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) {  // NOLINT
tensor.CopyToCpu<float>(static_cast<float *>(array.mutable_data()));
break;
case PaddleDType::FLOAT16:
-tensor.CopyToCpu<paddle::platform::float16>(
-static_cast<paddle::platform::float16 *>(array.mutable_data()));
+tensor.CopyToCpu<phi::dtype::float16>(
+static_cast<phi::dtype::float16 *>(array.mutable_data()));
break;
case PaddleDType::UINT8:
tensor.CopyToCpu(static_cast<uint8_t *>(array.mutable_data()));
@@ -1062,6 +1062,16 @@ void BindPaddleInferPredictor(py::module *m) {
.def("get_output_names", &paddle_infer::Predictor::GetOutputNames)
.def("get_input_handle", &paddle_infer::Predictor::GetInputHandle)
.def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
.def(
"run",
[](paddle_infer::Predictor &self, py::handle py_in_tensor_list) {
auto in_tensor_list =
CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0);
std::vector<paddle::Tensor> outputs;
self.Run(in_tensor_list, &outputs);
return py::handle(ToPyObject(outputs));
},
py::arg("inputs"))
.def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("run", [](paddle_infer::Predictor &self) { self.Run(); })
.def("clone", .def("clone",
[](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
...@@ -1091,9 +1101,9 @@ void BindZeroCopyTensor(py::module *m) { ...@@ -1091,9 +1101,9 @@ void BindZeroCopyTensor(py::module *m) {
.def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>) .def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>) .def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<float>) .def("copy_from_cpu", &ZeroCopyTensorCreate<float>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<phi::dtype::float16>)
// NOTE(liuyuanle): double must be bound after float. // NOTE(liuyuanle): double must be bound after float.
.def("copy_from_cpu", &ZeroCopyTensorCreate<double>) .def("copy_from_cpu", &ZeroCopyTensorCreate<double>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<paddle_infer::float16>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<bool>) .def("copy_from_cpu", &ZeroCopyTensorCreate<bool>)
.def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_from_cpu", &ZeroCopyStringTensorCreate)
.def("copy_to_cpu", &ZeroCopyTensorToNumpy) .def("copy_to_cpu", &ZeroCopyTensorToNumpy)
...@@ -1116,10 +1126,9 @@ void BindPaddleInferTensor(py::module *m) { ...@@ -1116,10 +1126,9 @@ void BindPaddleInferTensor(py::module *m) {
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>)
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>)
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<float>) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate<float>)
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<phi::dtype::float16>)
// NOTE(liuyuanle): double must be bound after float. // NOTE(liuyuanle): double must be bound after float.
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<double>) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate<double>)
.def("_copy_from_cpu_bind",
&PaddleInferTensorCreate<paddle_infer::float16>)
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate<bool>) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate<bool>)
.def("_copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferStringTensorCreate)
.def("_share_external_data_bind", &PaddleInferShareExternalData) .def("_share_external_data_bind", &PaddleInferShareExternalData)
......
@@ -416,7 +416,7 @@ class PADDLE_API Tensor final {
/**
* @brief Return the name of Tensor.
* @note Used to adapt original execution mechanism and debug analysis
-* in the development of new dygraph. It may be removed in the future.
+* in the development of new dygraph.
*
* @return const std::string&
*/
@@ -425,7 +425,7 @@
/**
* @brief Set name of Tensor.
* @note Used to adapt original execution mechanism and debug analysis
-* in the development of new dygraph. It may be removed in the future.
+* in the development of new dygraph.
*
* @param const std::string& name
*/
@@ -657,7 +657,7 @@
/**
* Tensor name: used to adapt original execution mechanism and debug analysis
-* in the development of new dygraph. It may be removed in the future.
+* in the development of new dygraph.
*/
std::string name_{""};
...
@@ -136,6 +136,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
Tensor out;
copy(x, place, blocking, &out);
+out.set_name(x.name());
return out;
}
...
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import unittest
import numpy as np
import paddle
from paddle.inference import Config, create_predictor
class TestNet(paddle.nn.Layer):
def __init__(self):
super().__init__()
self.fc1 = paddle.nn.Linear(4, 4)
self.fc2 = paddle.nn.Linear(4, 4)
def forward(self, x1, x2):
y1 = self.fc1(x1)
y2 = self.fc2(x2)
return y1 + y2
@unittest.skipIf(
not paddle.is_compiled_with_cuda(), 'should compile with cuda.'
)
class TestPredictorRunWithTensor(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
net = TestNet()
model = paddle.jit.to_static(
net,
input_spec=[
paddle.static.InputSpec(
shape=[None, 4], dtype='float32', name='input0'
),
paddle.static.InputSpec(
shape=[None, 4], dtype='float32', name='input1'
),
],
)
paddle.jit.save(
model,
os.path.join(
self.temp_dir.name, 'test_predictor_run_model/inference'
),
)
def tearDown(self):
self.temp_dir.cleanup()
def init_predictor(self):
config = Config(
os.path.join(
self.temp_dir.name,
'test_predictor_run_model/inference.pdmodel',
),
os.path.join(
self.temp_dir.name,
'test_predictor_run_model/inference.pdiparams',
),
)
config.enable_use_gpu(256, 0)
config.enable_memory_optim()
predictor = create_predictor(config)
return predictor
def get_inputs(self):
input0 = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]).astype(np.float32)
input1 = np.array([[0.1, 0.2, 0.3, 0.4], [1.2, 1.3, 1.4, 1.5]]).astype(
np.float32
)
input0_tensor = paddle.to_tensor(input0)
input1_tensor = paddle.to_tensor(input1)
return [input0_tensor, input1_tensor]
def get_disorder_output(self):
predictor = self.init_predictor()
[input0_tensor, input1_tensor] = self.get_inputs()
input_names = predictor.get_input_names()
input0_tensor.name = input_names[0]
input1_tensor.name = input_names[1]
# disorder
inputs = [input1_tensor, input0_tensor]
outputs = predictor.run(inputs)
return outputs[0]
def get_inorder_output(self):
predictor = self.init_predictor()
[input0_tensor, input1_tensor] = self.get_inputs()
# inorder
inputs = [input0_tensor, input1_tensor]
outputs = predictor.run(inputs)
return outputs[0]
def test_output(self):
inorder_output = self.get_inorder_output()
disorder_output = self.get_disorder_output()
assert np.allclose(
inorder_output.numpy().flatten(), disorder_output.numpy().flatten()
)
if __name__ == '__main__':
unittest.main()