Unverified commit e3f39833, authored by heliqi, committed via GitHub

[Paddle Inference] Support PaddlePaddle Backend on Triton (#49758)

* support PaddlePaddle Backend on Triton

* fix test cases

* fix Codestyle

* add test case

* add test case
Parent a1b2e1e2
@@ -1609,6 +1609,51 @@ std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  return output_names;
}

std::map<std::string, std::vector<int64_t>>
AnalysisPredictor::GetOutputTensorShape() {
  std::map<std::string, std::vector<int64_t>> output_shapes;
  std::vector<std::string> names = GetOutputNames();
  for (std::string name : names) {
    auto *var = inference_program_->Block(0).FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(var,
                            platform::errors::PreconditionNotMet(
                                "Output %s does not exist.", name));
    output_shapes[name] = var->GetShape();
  }
  return output_shapes;
}

std::map<std::string, paddle_infer::DataType>
AnalysisPredictor::GetOutputTypes() {
  std::map<std::string, paddle_infer::DataType> output_type;
  std::vector<std::string> names = GetOutputNames();
  for (const auto &name : names) {
    auto *var = inference_program_->Block(0).FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        platform::errors::PreconditionNotMet(
            "Output %s does not exist in inference_program_.", name));
    auto dtype = var->GetDataType();
    if (dtype == paddle::framework::proto::VarType::FP32) {
      output_type[name] = paddle_infer::DataType::FLOAT32;
    } else if (dtype == paddle::framework::proto::VarType::FP16) {
      output_type[name] = paddle_infer::DataType::FLOAT16;
    } else if (dtype == paddle::framework::proto::VarType::INT64) {
      output_type[name] = paddle_infer::DataType::INT64;
    } else if (dtype == paddle::framework::proto::VarType::INT32) {
      output_type[name] = paddle_infer::DataType::INT32;
    } else if (dtype == paddle::framework::proto::VarType::UINT8) {
      output_type[name] = paddle_infer::DataType::UINT8;
    } else if (dtype == paddle::framework::proto::VarType::INT8) {
      output_type[name] = paddle_infer::DataType::INT8;
    } else {
      PADDLE_THROW(paddle::platform::errors::Unimplemented(
          "Unsupported data type `%s` when getting output dtype.", dtype));
    }
  }
  return output_type;
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  framework::Scope *scope;
@@ -2477,6 +2522,10 @@ std::vector<std::string> Predictor::GetInputNames() {
  return predictor_->GetInputNames();
}

std::map<std::string, std::vector<int64_t>> Predictor::GetInputTensorShape() {
  return predictor_->GetInputTensorShape();
}

std::map<std::string, DataType> Predictor::GetInputTypes() {
  return predictor_->GetInputTypes();
}
@@ -2493,6 +2542,14 @@ std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
  return predictor_->GetOutputTensor(name);
}

std::map<std::string, std::vector<int64_t>> Predictor::GetOutputTensorShape() {
  return predictor_->GetOutputTensorShape();
}

std::map<std::string, DataType> Predictor::GetOutputTypes() {
  return predictor_->GetOutputTypes();
}

bool Predictor::Run() { return predictor_->ZeroCopyRun(); }

std::unique_ptr<Predictor> Predictor::Clone(void *stream) {
...
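
The new accessors mirror the existing input-side queries, so a deployment layer such as a Triton model backend can discover a model's full output signature before the first Run(). The snippet below is a minimal sketch, not part of this commit, with placeholder model file names; note that GetShape() reports -1 for dimensions the model leaves dynamic, so those cannot be sized ahead of time.

#include <iostream>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void DumpOutputMeta() {
  // Hypothetical model files; error handling elided for brevity.
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  auto predictor = paddle_infer::CreatePredictor(config);

  auto shapes = predictor->GetOutputTensorShape();
  auto dtypes = predictor->GetOutputTypes();
  for (const auto &kv : shapes) {
    int64_t numel = 1;
    bool dynamic = false;
    for (int64_t d : kv.second) {
      if (d < 0) {
        dynamic = true;  // -1 marks a dynamic dimension
      } else {
        numel *= d;
      }
    }
    std::string extent =
        dynamic ? "dynamic shape" : "numel=" + std::to_string(numel);
    std::cout << kv.first << ": dtype=" << static_cast<int>(dtypes[kv.first])
              << ", " << extent << std::endl;
  }
}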
@@ -191,6 +191,18 @@ class AnalysisPredictor : public PaddlePredictor {
  /// \return the map of input names and type
  ///
  std::map<std::string, paddle_infer::DataType> GetInputTypes() override;
  ///
  /// \brief Get all output names and their corresponding shapes
  ///
  /// \return the map of output names and shapes
  ///
  std::map<std::string, std::vector<int64_t>> GetOutputTensorShape() override;

  ///
  /// \brief Get all output names and their corresponding types
  ///
  /// \return the map of output names and types
  ///
  std::map<std::string, paddle_infer::DataType> GetOutputTypes() override;
  ///
  /// \brief Run the prediction engine
...
@@ -106,6 +106,8 @@ TEST(AnalysisPredictor, analysis_on) {
  ASSERT_EQ(predictor->scope_->parent(), nullptr);
  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
  ASSERT_EQ(predictor->GetInputTypes().size(), 4UL);
  ASSERT_EQ(predictor->GetOutputTypes().size(), 1UL);
  ASSERT_EQ(predictor->GetOutputTensorShape().size(), 1UL);
  // 2. Dummy Input Data
  int64_t data[4] = {1, 2, 3, 4};
  PaddleTensor tensor;
@@ -430,6 +432,8 @@ TEST(Predictor, Run) {
  auto predictor = CreatePredictor(config);
  ASSERT_EQ(predictor->GetInputTypes().size(), 4UL);
  ASSERT_EQ(predictor->GetOutputTypes().size(), 1UL);
  ASSERT_EQ(predictor->GetOutputTensorShape().size(), 1UL);
  auto w0 = predictor->GetInputHandle("firstw");
  auto w1 = predictor->GetInputHandle("secondw");
...
@@ -243,6 +243,19 @@ class PD_INFER_DECL PaddlePredictor {
  /// \return Output tensor names.
  virtual std::vector<std::string> GetOutputNames() { return {}; }
  /// \brief Get the output shapes of the model.
  /// \return A map containing all the output names and shapes defined in the
  /// model.
  virtual std::map<std::string, std::vector<int64_t>> GetOutputTensorShape() {
    return {};
  }

  /// \brief Get the output types of the model.
  /// \return A map containing all the output names and types defined in the
  /// model.
  virtual std::map<std::string, paddle_infer::DataType> GetOutputTypes() {
    return {};
  }

  /// \brief Get the input ZeroCopyTensor by name.
  /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
  /// The name is obtained from the GetInputNames() interface.
...
@@ -92,6 +92,13 @@ class PD_INFER_DECL Predictor {
  ///
  explicit Predictor(const Config& config);
  ///
  /// \brief Get all input names and their corresponding shapes
  ///
  /// \return the map of input names and shapes
  ///
  std::map<std::string, std::vector<int64_t>> GetInputTensorShape();
  ///
  /// \brief Get all input names and their corresponding type
  ///
@@ -136,6 +143,20 @@ class PD_INFER_DECL Predictor {
  ///
  std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
  ///
  /// \brief Get all output names and their corresponding shapes
  ///
  /// \return the map of output names and shapes
  ///
  std::map<std::string, std::vector<int64_t>> GetOutputTensorShape();

  ///
  /// \brief Get all output names and their corresponding types
  ///
  /// \return the map of output names and types
  ///
  std::map<std::string, DataType> GetOutputTypes();
  ///
  /// \brief Clone to get the new predictor. thread safe.
  ///
...
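
With shapes queryable on both sides, a request tensor can be validated against the model before Run(). A minimal sketch, not from this commit, assuming an extent of -1 in the model shape means the dimension is dynamic and matches any request extent:

#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Returns true when request_shape is compatible with the model's declared
// shape for the named input.
bool InputShapeMatches(paddle_infer::Predictor *predictor,
                       const std::string &name,
                       const std::vector<int64_t> &request_shape) {
  auto shapes = predictor->GetInputTensorShape();
  auto it = shapes.find(name);
  if (it == shapes.end()) return false;  // unknown input name
  const std::vector<int64_t> &model_shape = it->second;
  if (model_shape.size() != request_shape.size()) return false;
  for (size_t i = 0; i < model_shape.size(); ++i) {
    if (model_shape[i] >= 0 && model_shape[i] != request_shape[i]) {
      return false;  // fixed dimension mismatch
    }
  }
  return true;
}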
@@ -55,8 +55,9 @@ __pd_give PD_Config* PD_ConfigCreate() {
}

void PD_ConfigDestroy(__pd_take PD_Config* pd_config) {
  if (pd_config != NULL) {
    delete reinterpret_cast<Config*>(pd_config);
  }
}

void PD_ConfigSetModel(__pd_keep PD_Config* pd_config,
@@ -116,9 +117,12 @@ PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) {
void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config,
                           uint64_t memory_pool_init_size_mb,
                           int32_t device_id,
                           PD_PrecisionType precision_mode) {
  CHECK_AND_CONVERT_PD_CONFIG;
  config->EnableUseGpu(memory_pool_init_size_mb,
                       device_id,
                       ConvertToCxxPrecisionType(precision_mode));
}

void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
@@ -427,6 +431,14 @@ void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config,
  }
  config->SetBfloat16Op(std::move(op_names));
}

void PD_ConfigEnableMkldnnInt8(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
  config->EnableMkldnnInt8();
}

PD_Bool PD_ConfigMkldnnInt8Enabled(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
  return config->mkldnn_int8_enabled();
}

PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
  return config->thread_local_stream_enabled();
@@ -484,6 +496,10 @@ void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
  config->EnableGpuMultiStream();
}

void PD_ConfigSetExecStream(__pd_keep PD_Config* pd_config, void* stream) {
  CHECK_AND_CONVERT_PD_CONFIG;
  config->SetExecStream(stream);
}

void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) {
  CHECK_AND_CONVERT_PD_CONFIG;
  config->PartiallyRelease();
...
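
A short sketch of the new C-API entry points in use; it is illustrative only, the model directory is a placeholder, and PD_ConfigSetModelDir is the pre-existing setter used for directory-style models:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void ConfigureInt8() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModelDir(config, "./mobilenet");  // placeholder model directory
  // Quantize supported ops to int8 via the MKLDNN passes.
  PD_ConfigEnableMkldnnInt8(config);
  if (!PD_ConfigMkldnnInt8Enabled(config)) {
    // e.g. the library was built without MKLDNN support
  }
  PD_ConfigDestroy(config);  // now safe even for a NULL config
}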
@@ -132,11 +132,13 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding(
/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in
/// MB.
/// \param[in] device_id the GPU card to use.
/// \param[in] precision_mode the precision used in Paddle-GPU inference.
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu(
    __pd_keep PD_Config* pd_config,
    uint64_t memory_pool_init_size_mb,
    int32_t device_id,
    PD_PrecisionType precision_mode);
///
/// \brief Turn off GPU.
///
@@ -607,6 +609,22 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled(
PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op(
    __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list);
///
/// \brief Turn on MKLDNN int8.
///
/// \param[in] pd_config config
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnInt8(
    __pd_keep PD_Config* pd_config);
///
/// \brief A boolean state telling whether MKLDNN int8 is enabled.
///
/// \param[in] pd_config config
/// \return Whether MKLDNN int8 is enabled.
///
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnInt8Enabled(
    __pd_keep PD_Config* pd_config);
///
/// \brief Enable the GPU multi-computing stream feature.
/// NOTE: The current behavior of this interface is to bind the computation
/// stream to the thread, and this behavior may be changed in the future.
@@ -625,6 +643,12 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream(
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled(
    __pd_keep PD_Config* pd_config);
///
/// \brief Set the execution stream. If not set, a stream will be created
/// internally.
///
PADDLE_CAPI_EXPORT extern void PD_ConfigSetExecStream(
    __pd_keep PD_Config* pd_config, void* stream);
///
/// \brief Specify the memory buffer of program and parameter.
/// Used when model and params are loaded directly from memory.
///
...
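
This is a breaking change for C-API callers: every PD_ConfigEnableUseGpu call site gains a fourth argument. The bindings updated later in this diff pass a literal 0, which, assuming PD_PrecisionType keeps PD_PRECISION_FLOAT32 as its zero value, preserves the old fp32 behavior:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void ConfigureGpu() {
  PD_Config* config = PD_ConfigCreate();
  // 100 MB initial memory pool on GPU 0; PD_PRECISION_FLOAT32 assumed to be
  // the enum's zero value, matching the literal 0 used by the Java/Go code.
  PD_ConfigEnableUseGpu(config, 100, 0, PD_PRECISION_FLOAT32);
  PD_ConfigDestroy(config);
}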
@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/capi_exp/pd_predictor.h"

#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/capi_exp/pd_config.h"
#include "paddle/fluid/inference/capi_exp/pd_types.h"
#include "paddle/fluid/inference/capi_exp/pd_utils.h"
#include "paddle/fluid/inference/capi_exp/types_internal.h"
@@ -38,7 +39,6 @@ __pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) {
  paddle_infer::Config* config =
      reinterpret_cast<paddle_infer::Config*>(pd_config);
  pd_predictor->predictor = paddle_infer::CreatePredictor(*config);
  return pd_predictor;
}
@@ -57,6 +57,30 @@ __pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames(
  return paddle_infer::CvtVecToOneDimArrayCstr(names);
}

__pd_give PD_IOInfos* PD_PredictorGetInputInfos(
    __pd_keep PD_Predictor* pd_predictor) {
  CHECK_AND_CONVERT_PD_PREDICTOR;
  std::vector<std::string> names = predictor->GetInputNames();
  std::map<std::string, std::vector<int64_t>> input_shapes =
      predictor->GetInputTensorShape();
  std::map<std::string, paddle_infer::DataType> input_dtypes =
      predictor->GetInputTypes();

  PD_IOInfos* input_infos = new PD_IOInfos;
  input_infos->size = names.size();
  input_infos->io_info = names.empty() ? NULL : new PD_IOInfo*[names.size()];
  for (size_t i = 0; i < names.size(); i++) {
    const std::string& name = names[i];
    input_infos->io_info[i] = new PD_IOInfo;
    input_infos->io_info[i]->name = paddle_infer::CvtStrToCstr(name);
    input_infos->io_info[i]->shape =
        paddle_infer::CvtVecToOneDimArrayInt64(input_shapes[name]);
    input_infos->io_info[i]->dtype =
        paddle_infer::CvtFromCxxDatatype(input_dtypes[name]);
  }
  return input_infos;
}

__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames(
    __pd_keep PD_Predictor* pd_predictor) {
  CHECK_AND_CONVERT_PD_PREDICTOR;
@@ -64,6 +88,30 @@ __pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames(
  return paddle_infer::CvtVecToOneDimArrayCstr(names);
}

__pd_give PD_IOInfos* PD_PredictorGetOutputInfos(
    __pd_keep PD_Predictor* pd_predictor) {
  CHECK_AND_CONVERT_PD_PREDICTOR;
  std::vector<std::string> names = predictor->GetOutputNames();
  std::map<std::string, std::vector<int64_t>> output_shapes =
      predictor->GetOutputTensorShape();
  std::map<std::string, paddle_infer::DataType> output_dtypes =
      predictor->GetOutputTypes();

  PD_IOInfos* output_infos = new PD_IOInfos;
  output_infos->size = names.size();
  output_infos->io_info = names.empty() ? NULL : new PD_IOInfo*[names.size()];
  for (size_t i = 0; i < names.size(); i++) {
    const std::string& name = names[i];
    output_infos->io_info[i] = new PD_IOInfo;
    output_infos->io_info[i]->name = paddle_infer::CvtStrToCstr(name);
    output_infos->io_info[i]->shape =
        paddle_infer::CvtVecToOneDimArrayInt64(output_shapes[name]);
    output_infos->io_info[i]->dtype =
        paddle_infer::CvtFromCxxDatatype(output_dtypes[name]);
  }
  return output_infos;
}

size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) {
  CHECK_AND_CONVERT_PD_PREDICTOR;
  return predictor->GetInputNames().size();
...
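
Two ownership notes fall out of this file. First, PD_PredictorCreate no longer deletes the PD_Config handed to it (and PD_ConfigDestroy above now tolerates NULL), so callers destroy their config themselves. Second, callers own the returned PD_IOInfos and must release it with the destroy helper added in pd_utils.cc below. A minimal consumption sketch, assuming PD_Cstr carries its characters in a data field as elsewhere in this C API:

#include <cstdio>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void PrintInputInfos(PD_Predictor* predictor) {
  PD_IOInfos* infos = PD_PredictorGetInputInfos(predictor);
  for (size_t i = 0; i < infos->size; ++i) {
    PD_IOInfo* info = infos->io_info[i];
    printf("input %s: rank %zu, dtype %d\n",
           info->name->data,
           info->shape->size,
           static_cast<int>(info->dtype));
  }
  PD_IOInfosDestroy(infos);  // frees names, shapes, and the array itself
}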
@@ -30,6 +30,7 @@ typedef struct PD_Predictor PD_Predictor;
typedef struct PD_Config PD_Config;
typedef struct PD_Tensor PD_Tensor;
typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr;
typedef struct PD_IOInfos PD_IOInfos;

#ifdef __cplusplus
extern "C" {
@@ -60,6 +61,14 @@ PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone(
PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr*
PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor);
///
/// \brief Get the input infos (name, shape, and data type)
///
/// \param[in] pd_predictor predictor
/// \return input infos (name, shape, and data type)
///
PADDLE_CAPI_EXPORT extern __pd_give PD_IOInfos* PD_PredictorGetInputInfos(
    __pd_keep PD_Predictor* pd_predictor);
///
/// \brief Get the output names
///
/// \param[in] pd_predictor predictor
@@ -67,7 +76,14 @@ PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor);
///
PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr*
PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor);
///
/// \brief Get the output infos (name, shape, and data type)
///
/// \param[in] pd_predictor predictor
/// \return output infos (name, shape, and data type)
///
PADDLE_CAPI_EXPORT extern __pd_give PD_IOInfos* PD_PredictorGetOutputInfos(
    __pd_keep PD_Predictor* pd_predictor);
///
/// \brief Get the input number
///
...
@@ -29,6 +29,11 @@ typedef struct PD_OneDimArraySize {
  size_t* data;
} PD_OneDimArraySize;  // std::vector<size_t>

typedef struct PD_OneDimArrayInt64 {
  size_t size;
  int64_t* data;
} PD_OneDimArrayInt64;  // std::vector<int64_t>

typedef struct PD_OneDimArrayCstr {
  size_t size;
  char** data;
@@ -43,3 +48,14 @@ typedef struct PD_TwoDimArraySize {
  size_t size;
  PD_OneDimArraySize** data;
} PD_TwoDimArraySize;  // std::vector<std::vector<size_t>>

typedef struct PD_IOInfo {
  PD_Cstr* name;
  PD_OneDimArrayInt64* shape;
  PD_DataType dtype;
} PD_IOInfo;  // input or output info

typedef struct PD_IOInfos {
  size_t size;
  PD_IOInfo** io_info;
} PD_IOInfos;  // inputs or outputs info
@@ -11,12 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/capi_exp/pd_utils.h"
#include "paddle/fluid/inference/capi_exp/utils_internal.h"
#include "paddle/fluid/platform/enforce.h"
@@ -62,6 +60,7 @@
ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int)
ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t)
ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int64_t, Int64, int64_t)

#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL
#undef CONVERT_ONE_DIM_ARRAY_TO_VEC
@@ -178,6 +177,38 @@ TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t)
#undef CONVERT_VEC_TO_TWO_DIM_ARRAY
#undef DESTROY_TWO_DIM_ARRAY

#ifdef __cplusplus
extern "C" {
#endif

void PD_IOInfoDestroy(__pd_take PD_IOInfo* io_info) {
  if (io_info != NULL) {
    PD_CstrDestroy(io_info->name);
    io_info->name = NULL;
    PD_OneDimArrayInt64Destroy(io_info->shape);
    io_info->shape = NULL;
    delete io_info;
  }
}

void PD_IOInfosDestroy(__pd_take PD_IOInfos* io_infos) {
  if (io_infos != NULL) {
    if (io_infos->size != 0) {
      for (size_t index = 0; index < io_infos->size; ++index) {
        PD_IOInfoDestroy(io_infos->io_info[index]);
      }
      io_infos->size = 0;
    }
    delete[] io_infos->io_info;
    io_infos->io_info = NULL;
    delete io_infos;
  }
}

#ifdef __cplusplus
}  // extern "C"
#endif

namespace paddle_infer {

PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) {
...
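
The Int64 conversion and destroy helpers come from the existing ONE_DIM_ARRAY_UTILS_FUNC_IMPL macro, whose body is outside this diff. Assuming it mirrors what the Int32 and Size instantiations provide, the new instantiation roughly expands to:

// Approximate expansion of ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int64_t, Int64,
// int64_t); the real macro body is not shown in this diff.
__pd_give PD_OneDimArrayInt64* CvtVecToOneDimArrayInt64(
    const std::vector<int64_t>& vec) {
  PD_OneDimArrayInt64* array = new PD_OneDimArrayInt64;
  array->size = vec.size();
  array->data = vec.empty() ? NULL : new int64_t[vec.size()];
  for (size_t i = 0; i < vec.size(); ++i) {
    array->data[i] = vec[i];
  }
  return array;
}

std::vector<int64_t> CvtOneDimArrayToVecInt64(
    __pd_keep const PD_OneDimArrayInt64* array) {
  return std::vector<int64_t>(array->data, array->data + array->size);
}

void PD_OneDimArrayInt64Destroy(__pd_take PD_OneDimArrayInt64* array) {
  if (array != NULL) {
    delete[] array->data;
    delete array;
  }
}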
@@ -41,6 +41,14 @@ extern "C" {
PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy(
    __pd_take PD_OneDimArrayInt32* array);
///
/// \brief Destroy the PD_OneDimArrayInt64 object pointed to by the pointer.
///
/// \param[in] array pointer to the PD_OneDimArrayInt64 object.
///
PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt64Destroy(
    __pd_take PD_OneDimArrayInt64* array);
///
/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer.
///
@@ -74,6 +82,21 @@ PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy(
///
PADDLE_CAPI_EXPORT extern void PD_CstrDestroy(__pd_take PD_Cstr* cstr);
///
/// \brief Destroy the PD_IOInfo object pointed to by the pointer.
///
/// \param[in] io_info pointer to the PD_IOInfo object.
///
PADDLE_CAPI_EXPORT extern void PD_IOInfoDestroy(__pd_take PD_IOInfo* io_info);
///
/// \brief Destroy the PD_IOInfos object pointed to by the pointer.
///
/// \param[in] io_infos pointer to the PD_IOInfos object.
///
PADDLE_CAPI_EXPORT extern void PD_IOInfosDestroy(
    __pd_take PD_IOInfos* io_infos);

#ifdef __cplusplus
}  // extern "C"
#endif
@@ -44,6 +44,16 @@ namespace paddle_infer {
__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32(
    const std::vector<int>& vec);
///
/// \brief Convert the 'std::vector<int64_t>' object to a 'PD_OneDimArrayInt64'
/// object.
///
/// \param[in] vec source object.
/// \return target object.
///
__pd_give PD_OneDimArrayInt64* CvtVecToOneDimArrayInt64(
    const std::vector<int64_t>& vec);
///
/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector<int>'
/// object.
@@ -54,6 +64,16 @@ __pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32(
std::vector<int> CvtOneDimArrayToVecInt32(
    __pd_keep const PD_OneDimArrayInt32* array);
///
/// \brief Convert the 'PD_OneDimArrayInt64' object to a 'std::vector<int64_t>'
/// object.
///
/// \param[in] array source object.
/// \return target object.
///
std::vector<int64_t> CvtOneDimArrayToVecInt64(
    __pd_keep const PD_OneDimArrayInt64* array);
///
/// \brief Convert the 'std::vector<size_t>' object to a 'PD_OneDimArraySize'
/// object.
...
@@ -161,7 +161,8 @@ JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_enableUseGpu(
    jint deviceId) {
  PD_ConfigEnableUseGpu(reinterpret_cast<PD_Config*>(cppPaddleConfigPointer),
                        (uint64_t)memorySize,
                        (int32_t)deviceId,
                        0);
}

JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_disableGpu(
...
@@ -157,7 +157,7 @@ func (config *Config) UseFcPadding() bool {
/// \param deviceId the GPU card to use.
///
func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) {
	C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId), 0)
}

///
...
@@ -19,6 +19,10 @@ limitations under the License. */
#include <string>
#include <vector>

#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -37,7 +41,7 @@ TEST(PD_Config, gpu_interface) {
  PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str());
  PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str());
  PD_ConfigEnableUseGpu(config, 100, 0, 0);
  bool use_gpu = PD_ConfigUseGpu(config);
  EXPECT_TRUE(use_gpu);
  int init_size = PD_ConfigMemoryPoolInitSizeMb(config);
@@ -84,6 +88,14 @@ TEST(PD_Config, gpu_interface) {
  bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config);
  EXPECT_TRUE(thread_local_thread);

#if defined(PADDLE_WITH_CUDA)
  {
    cudaStream_t external_stream;
    cudaStreamCreate(&external_stream);
    PD_ConfigSetExecStream(config, external_stream);
  }
#endif

  PD_ConfigDisableGpu(config);
  PD_ConfigDestroy(config);
}
@@ -104,7 +116,7 @@ TEST(PD_Config, use_gpu) {
  const char* model_dir_ = PD_ConfigGetModelDir(config);
  LOG(INFO) << model_dir_;
  PD_ConfigEnableUseGpu(config, 100, 0, 0);
  bool use_gpu = PD_ConfigUseGpu(config);
  EXPECT_TRUE(use_gpu);
  int device_id = PD_ConfigGpuDeviceId(config);
@@ -142,7 +154,7 @@ TEST(PD_Config, use_gpu) {
TEST(PD_Config, trt_int8) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigEnableUseGpu(config, 100, 0, 0);
  PD_ConfigEnableTensorRtEngine(
      config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, TRUE);
  bool trt_enable = PD_ConfigTensorRtEngineEnabled(config);
@@ -153,7 +165,7 @@ TEST(PD_Config, trt_int8) {
TEST(PD_Config, trt_fp16) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigEnableUseGpu(config, 100, 0, 0);
  PD_ConfigEnableTensorRtEngine(
      config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, FALSE);
  bool trt_enable = PD_ConfigTensorRtEngineEnabled(config);
...
@@ -37,6 +37,9 @@ void predictor_run() {
  PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor);
  LOG(INFO) << "The inputs' size is: " << input_names->size;
  EXPECT_EQ(input_names->size, 2u);
  PD_IOInfos* in_infos = PD_PredictorGetInputInfos(predictor);
  EXPECT_EQ(in_infos->size, 2u);
  PD_IOInfos* out_infos = PD_PredictorGetOutputInfos(predictor);

  int32_t shape_0[4] = {1, 3, 224, 224};
  float data_0[1 * 3 * 224 * 224] = {0};
@@ -79,6 +82,8 @@ void predictor_run() {
  PD_TensorDestroy(input_1);
  PD_TensorDestroy(input_0);
  PD_OneDimArrayCstrDestroy(input_names);
  PD_IOInfosDestroy(in_infos);
  PD_IOInfosDestroy(out_infos);
  PD_PredictorDestroy(predictor);
}
...
@@ -85,6 +85,10 @@ TEST(PD_Config, interface) {
  PD_ConfigEnableMkldnnBfloat16(config);
  PD_ConfigSetBfloat16Op(config, 1, &ops_name);

  PD_ConfigEnableMkldnnInt8(config);
  bool mkldnn_int8_enabled = PD_ConfigMkldnnInt8Enabled(config);
  EXPECT_TRUE(mkldnn_int8_enabled);
#endif

  PD_ConfigEnableONNXRuntime(config);
...