Unverified commit 24f55aed, authored by Allen Guo, committed by GitHub

[IPU] update inference demos (#39792)

* update inference part

* restore white space
Parent 4130b640
......@@ -48,11 +48,10 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
#TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
elseif(WITH_IPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu)
else()
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
if(WITH_IPU)
target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils)
endif()
endif()
if(NOT APPLE)
......
......@@ -278,10 +278,14 @@ struct Argument {
// ipu related
DECL_ARGUMENT_FIELD(use_ipu, UseIpu, bool);
DECL_ARGUMENT_FIELD(ipu_device_num, IpuDeviceNum, int);
DECL_ARGUMENT_FIELD(ipu_micro_batch_size, IpuMicroBatchSize, int);
DECL_ARGUMENT_FIELD(ipu_enable_pipelining, IpuEnablePipelining, bool);
DECL_ARGUMENT_FIELD(ipu_batches_per_step, IpuBatchesPerStep, int);
DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int);
DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool);
DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool);
DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int);
DECL_ARGUMENT_FIELD(ipu_available_memory_proportion,
IpuAvailableMemoryProportion, float);
DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool);
// npu related
DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool);
......
......@@ -72,17 +72,21 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
if (argument->use_ipu()) {
argument->main_graph().SetNotOwned("num_ipus",
&argument->ipu_device_num());
argument->main_graph().SetNotOwned("need_avg_shard",
&argument->ipu_need_avg_shard());
argument->main_graph().SetNotOwned("micro_batch_size",
&argument->ipu_micro_batch_size());
argument->main_graph().SetNotOwned("enable_pipelining",
&argument->ipu_enable_pipelining());
argument->main_graph().SetNotOwned("batches_per_step",
&argument->ipu_batches_per_step());
argument->main_graph().SetNotOwned("batch_size",
&argument->ipu_batch_size());
} else {
PADDLE_THROW(
platform::errors::Unimplemented("Please compile with WITH_IPU"));
argument->main_graph().SetNotOwned("enable_fp16",
&argument->ipu_enable_fp16());
argument->main_graph().SetNotOwned("replica_num",
&argument->ipu_replica_num());
argument->main_graph().SetNotOwned(
"available_memory_proportion",
&argument->ipu_available_memory_proportion());
argument->main_graph().SetNotOwned("enable_half_partial",
&argument->ipu_enable_half_partial());
}
}
#endif
......
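The graph attributes registered in this pass are read back later by the IPU-specific passes. A minimal sketch of the consumer side, assuming the standard ir::Graph attribute API (the function name and logging below are illustrative only, not part of this change):

// Illustrative only: reading the not-owned attributes set by IrGraphBuildPass.
#include <glog/logging.h>
#include "paddle/fluid/framework/ir/graph.h"

void LogIpuGraphAttrs(const paddle::framework::ir::Graph &graph) {
  // Each Get<T>() returns the value registered via SetNotOwned() above.
  int num_ipus = graph.Get<int>("num_ipus");
  int micro_batch_size = graph.Get<int>("micro_batch_size");
  bool enable_pipelining = graph.Get<bool>("enable_pipelining");
  int batches_per_step = graph.Get<int>("batches_per_step");
  bool enable_fp16 = graph.Get<bool>("enable_fp16");
  int replica_num = graph.Get<int>("replica_num");
  float available_memory_proportion =
      graph.Get<float>("available_memory_proportion");
  bool enable_half_partial = graph.Get<bool>("enable_half_partial");
  VLOG(3) << "num_ipus=" << num_ipus
          << ", micro_batch_size=" << micro_batch_size
          << ", enable_pipelining=" << enable_pipelining
          << ", batches_per_step=" << batches_per_step
          << ", enable_fp16=" << enable_fp16
          << ", replica_num=" << replica_num
          << ", available_memory_proportion=" << available_memory_proportion
          << ", enable_half_partial=" << enable_half_partial;
}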
......@@ -142,17 +142,28 @@ void AnalysisConfig::EnableNpu(int device_id) {
Update();
}
void AnalysisConfig::EnableIpu(int device_num, bool ipu_enable_pipelining,
int ipu_batches_per_step, int ipu_batch_size,
bool ipu_need_avg_shard) {
void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size,
bool ipu_enable_pipelining,
int ipu_batches_per_step) {
enable_ir_optim_ = true;
use_ipu_ = true;
ipu_device_num_ = device_num;
ipu_device_num_ = ipu_device_num;
ipu_micro_batch_size_ = ipu_micro_batch_size;
ipu_enable_pipelining_ = ipu_enable_pipelining;
ipu_batches_per_step_ = ipu_batches_per_step;
ipu_batch_size_ = ipu_batch_size;
ipu_need_avg_shard_ = ipu_need_avg_shard;
Update();
}
void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num,
float ipu_available_memory_proportion,
bool ipu_enable_half_partial) {
ipu_enable_fp16_ = ipu_enable_fp16;
ipu_replica_num_ = ipu_replica_num;
ipu_available_memory_proportion_ = ipu_available_memory_proportion;
ipu_enable_half_partial_ = ipu_enable_half_partial;
Update();
}
......@@ -255,10 +266,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// ipu related
CP_MEMBER(use_ipu_);
CP_MEMBER(ipu_device_num_);
CP_MEMBER(ipu_micro_batch_size_);
CP_MEMBER(ipu_enable_pipelining_);
CP_MEMBER(ipu_batches_per_step_);
CP_MEMBER(ipu_batch_size_);
CP_MEMBER(ipu_need_avg_shard_);
CP_MEMBER(ipu_enable_fp16_);
CP_MEMBER(ipu_replica_num_);
CP_MEMBER(ipu_available_memory_proportion_);
CP_MEMBER(ipu_enable_half_partial_);
if (use_gpu_) {
PADDLE_ENFORCE_EQ(use_xpu_, false,
......@@ -684,10 +698,13 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_ipu_;
ss << ipu_device_num_;
ss << ipu_micro_batch_size_;
ss << ipu_enable_pipelining_;
ss << ipu_batches_per_step_;
ss << ipu_batch_size_;
ss << ipu_need_avg_shard_;
ss << ipu_enable_fp16_;
ss << ipu_replica_num_;
ss << ipu_available_memory_proportion_;
ss << ipu_enable_half_partial_;
return ss.str();
}
......
......@@ -93,6 +93,8 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
input_ptr = t->mutable_data<float>(ddim, place);
} else if (pt.dtype == PaddleDType::INT32) {
input_ptr = t->mutable_data<int32_t>(ddim, place);
} else if (pt.dtype == PaddleDType::FLOAT16) {
input_ptr = t->mutable_data<float16>(ddim, place);
} else {
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
......@@ -563,8 +565,12 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::INT32) {
GetFetchOne<int32_t>(fetch, output);
output->dtype = PaddleDType::INT32;
} else if (type == framework::proto::VarType::FP16) {
GetFetchOne<float16>(fetch, output);
output->dtype = PaddleDType::FLOAT16;
} else {
LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
LOG(ERROR) << "unknown type, only support float32, float16, int64 and "
"int32 now.";
}
}
return true;
......@@ -662,12 +668,18 @@ void AnalysisPredictor::PrepareArgument() {
LOG(INFO) << "Lite subgraph engine is enabled";
}
#ifdef PADDLE_WITH_IPU
argument_.SetUseIpu(config_.use_ipu_);
argument_.SetIpuDeviceNum(config_.ipu_device_num());
argument_.SetIpuMicroBatchSize(config_.ipu_micro_batch_size_);
argument_.SetIpuEnablePipelining(config_.ipu_enable_pipelining_);
argument_.SetIpuBatchesPerStep(config_.ipu_batches_per_step_);
argument_.SetIpuBatchSize(config_.ipu_batch_size_);
argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_);
argument_.SetIpuEnableFp16(config_.ipu_enable_fp16_);
argument_.SetIpuReplicaNum(config_.ipu_replica_num_);
argument_.SetIpuAvailableMemoryProportion(
config_.ipu_available_memory_proportion_);
argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_);
#endif
argument_.SetUseNpu(config_.use_npu_);
argument_.SetNPUDeviceId(config_.npu_device_id());
......
......@@ -234,20 +234,30 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \brief Turn on IPU.
///
/// \param device_num The number of IPUs.
/// \param ipu_enable_pipelining Enable data pipelining between subgraphs,
/// each subgraph is settled on an IPU. (This feature requires the number of
/// IPUs > 1.)
/// \param ipu_batches_per_step The number of micro_batch_size per run. (This
/// feature requires to enable pipelining.)
/// \param ipu_batch_size The micro_batch_size which is the batch_size in the
/// graph.
/// \param ipu_need_avg_shard Enable the auto graph sharding. (This feature
/// requires the number of IPUs > 1.)
///
void EnableIpu(int device_num = 1, bool ipu_enable_pipelining = false,
int ipu_batches_per_step = 1, int ipu_batch_size = 1,
bool ipu_need_avg_shard = false);
/// \param ipu_device_num the number of IPUs.
/// \param ipu_micro_batch_size the batch size in the graph; only works with
/// mutable input shapes.
/// \param ipu_enable_pipelining enable pipelining.
/// \param ipu_batches_per_step the number of batches per run in pipelining.
///
void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1,
bool ipu_enable_pipelining = false,
int ipu_batches_per_step = 1);
///
/// \brief Set IPU config.
///
/// \param ipu_enable_fp16 enable fp16.
/// \param ipu_replica_num the number of graph replicas.
/// \param ipu_available_memory_proportion the available memory proportion for
/// matmul/conv.
/// \param ipu_enable_half_partial enable fp16 partial for matmul; only works
/// with fp16.
///
void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1,
float ipu_available_memory_proportion = 1.0,
bool ipu_enable_half_partial = false);
///
/// \brief Set XPU device id.
///
......@@ -876,11 +886,14 @@ struct PD_INFER_DECL AnalysisConfig {
// ipu related.
bool use_ipu_{false};
int ipu_device_num_{1};
int ipu_micro_batch_size_{1};
bool ipu_enable_pipelining_{false};
int ipu_batches_per_step_{1};
int ipu_batch_size_{1};
bool ipu_need_avg_shard_{false};
bool ipu_enable_fp16_{false};
int ipu_replica_num_{1};
float ipu_available_memory_proportion_{1.0};
bool ipu_enable_half_partial_{false};
// If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor.
......
......@@ -45,7 +45,7 @@ enum DataType {
// TODO(Superjomn) support more data types if needed.
};
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU };
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU };
/// \brief Represents an n-dimensional array of values.
/// The Tensor is used to store the input or output of the network.
......
......@@ -758,11 +758,30 @@ if(ON_INFER OR WITH_GPU)
set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
endif()
# IPU
if (WITH_IPU)
#resnet50
#word2vec sample
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model")
inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${WORD2VEC_INSTALL_DIR})
# ERNIE
set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie")
inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc
ARGS --warmup=true --repeat=10)
inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc
ARGS --warmup=true --repeat=10)
# Resnet50
set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=1000)
ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
# Only support Resnet50 and Ernie currently
inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc
ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
endif()
......@@ -150,8 +150,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false,
void SetIpuConfig(AnalysisConfig *cfg, int batch_size = 1) {
cfg->SetModel(FLAGS_infer_model);
// num_ipu, enable_pipelining, batches_per_step, batch_size, need_avg_shard
cfg->EnableIpu(4, false, 1, batch_size, true);
cfg->EnableIpu(4, batch_size, false, 1);
}
} // namespace inference
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using paddle::PaddleTensor;
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
tensors->clear();
tensors->reserve(4);
int i = 0;
auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_";
for (; i < 3; i++) {
paddle::PaddleTensor temp;
ParseTensor<int64_t>(fields[i], &temp);
temp.name = input_name + std::to_string(i);
tensors->push_back(temp);
}
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i], &input_mask);
// fp32 to fp16
ConvertFP32toFP16(input_mask);
input_mask.name = input_name + std::to_string(i);
tensors->push_back(input_mask);
return true;
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
int batch_size = 1) {
if (FLAGS_infer_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_infer_data);
std::string line;
int sample = 0;
// The unit-test dataset only has 10 samples; each sample has 5 feeds.
while (std::getline(fin, line)) {
std::vector<paddle::PaddleTensor> feed_data;
ParseLine(line, &feed_data);
inputs->push_back(std::move(feed_data));
sample++;
if (!FLAGS_test_all_data && sample == batch_size) break;
}
LOG(INFO) << "number of samples: " << sample;
return true;
}
void SetConfig(AnalysisConfig *cfg, int batch_size = 1) {
cfg->SetModel(FLAGS_infer_model);
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
cfg->EnableIpu(1, batch_size, false);
// ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion,
// ipu_enable_half_partial
cfg->SetIpuConfig(true, 1, 1.0, true);
}
// Compare results
TEST(Analyzer_Ernie_ipu, compare_results) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
auto predictor = CreateTestPredictor(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
FLAGS_use_analysis);
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
ConvertFP16toFP32(output);
auto outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *fp32_data = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], fp32_data[j], 5e-3);
}
}
}
} // namespace inference
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using paddle::PaddleTensor;
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
tensors->clear();
tensors->reserve(4);
int i = 0;
auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_";
for (; i < 3; i++) {
paddle::PaddleTensor temp;
ParseTensor<int64_t>(fields[i], &temp);
temp.name = input_name + std::to_string(i);
tensors->push_back(temp);
}
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i], &input_mask);
input_mask.name = input_name + std::to_string(i);
tensors->push_back(input_mask);
return true;
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
int batch_size = 1) {
if (FLAGS_infer_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_infer_data);
std::string line;
int sample = 0;
// The unit-test dataset only has 10 samples; each sample has 5 feeds.
while (std::getline(fin, line)) {
std::vector<paddle::PaddleTensor> feed_data;
ParseLine(line, &feed_data);
inputs->push_back(std::move(feed_data));
sample++;
if (!FLAGS_test_all_data && sample == batch_size) break;
}
LOG(INFO) << "number of samples: " << sample;
return true;
}
void SetConfig(AnalysisConfig *cfg, int batch_size = 1) {
cfg->SetModel(FLAGS_infer_model);
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
cfg->EnableIpu(1, batch_size, false);
}
void profile() {
AnalysisConfig config;
SetConfig(&config);
std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> inputs;
LoadInputData(&inputs);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
inputs, &outputs, FLAGS_num_threads);
}
// Compare Deterministic result
TEST(Analyzer_Ernie_ipu, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
// Compare results
TEST(Analyzer_Ernie_ipu, compare_results) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
auto predictor = CreateTestPredictor(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
FLAGS_use_analysis);
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto outputs_size = outputs.front().data.length() / (sizeof(float));
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j],
static_cast<float *>(outputs[0].data.data())[j],
FLAGS_accuracy);
}
}
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
void ErnieInputData(const int &total_batch_size, const bool enable_fp16,
std::vector<PaddleTensor> *inputs) {
const int input_num = total_batch_size * 128 * 1;
std::vector<int64_t> placeholder_012(input_num, 1);
std::vector<float> placeholder_3(input_num, 1);
for (int i = 0; i < 4; i++) {
PaddleTensor in;
in.name = "placeholder_" + std::to_string(i);
in.shape = {total_batch_size, 128, 1};
if (i < 3) {
in.data = PaddleBuf(static_cast<void *>(placeholder_012.data()),
input_num * sizeof(int64_t));
in.dtype = PaddleDType::INT64;
} else {
in.data = PaddleBuf(static_cast<void *>(placeholder_3.data()),
input_num * sizeof(float));
in.dtype = PaddleDType::FLOAT32;
if (enable_fp16) {
ConvertFP32toFP16(in);
}
}
inputs->push_back(std::move(in));
}
}
void Resnet50InputData(const int &total_batch_size, const bool enable_fp16,
std::vector<paddle::PaddleTensor> *inputs) {
const int input_num = total_batch_size * 3 * 318 * 318;
std::vector<float> input(input_num, 1);
PaddleTensor in;
in.shape = {total_batch_size, 3, 318, 318};
in.data =
PaddleBuf(static_cast<void *>(input.data()), input_num * sizeof(float));
in.dtype = PaddleDType::FLOAT32;
if (enable_fp16) {
ConvertFP32toFP16(in);
}
inputs->push_back(std::move(in));
}
// performance profile
TEST(Analyzer_ipu_fp16, performance_profile) {
AnalysisConfig config;
std::vector<PaddleTensor> inputs;
std::vector<std::vector<PaddleTensor>> outputs;
int total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_replica_num;
if (FLAGS_ipu_enable_pipelining) {
// if device_num > 1 and pipelining is enabled, the total batch size =
// micro_batch_size * device_num(batches_per_step) * replica_num
total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_batches_per_step *
FLAGS_ipu_replica_num;
}
if (FLAGS_model_name == "Resnet50") {
config.SetModel(FLAGS_infer_model + "/model/model",
FLAGS_infer_model + "/model/params");
Resnet50InputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs);
} else if (FLAGS_model_name == "Ernie") {
config.SetModel(FLAGS_infer_model + "/model/");
ErnieInputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support Resnet50 and Ernie Currently"));
}
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining,
// ipu_batches_per_step
config.EnableIpu(FLAGS_ipu_device_num, FLAGS_ipu_micro_batch_size,
FLAGS_ipu_enable_pipelining, FLAGS_ipu_batches_per_step);
// ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion,
// ipu_enable_half_partial
config.SetIpuConfig(FLAGS_ipu_enable_fp16, FLAGS_ipu_replica_num,
FLAGS_ipu_available_memory_proportion,
FLAGS_ipu_enable_half_partial);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
{inputs}, &outputs, 1);
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
// Compare results with 1 batch
TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
AnalysisConfig config;
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
config.EnableIpu(1, 1, false);
// ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion,
// ipu_enable_half_partial
config.SetIpuConfig(true, 1, 1.0, true);
config.SetModel(model_dir + "/model", model_dir + "/params");
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
PaddleTensor in;
in.shape = {batch, channel, height, width};
in.data =
PaddleBuf(static_cast<void*>(input.data()), input_num * sizeof(float));
in.dtype = PaddleDType::FLOAT32;
ConvertFP32toFP16(in);
inputs.emplace_back(in);
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(inputs, &outputs));
const std::vector<float> truth_values = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f,
736.222f, -633.684f, -329.927f, -430.155f, -633.062f, -146.548f,
-1324.28f, -1349.36f, -242.675f, 117.448f, -801.723f, -391.514f,
-404.818f, 454.16f, 515.48f, -133.031f, 69.293f, 590.096f,
-1434.69f, -1070.89f, 307.074f, 400.525f, -316.12f, -587.125f,
-161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, -447.938f,
112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f,
246.019f, -8.42969f, 131.365f, -648.051f};
const size_t expected_size = 1;
EXPECT_EQ(outputs.size(), expected_size);
auto output = outputs.front();
ConvertFP16toFP32(output);
auto outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float* fp32_data = reinterpret_cast<float*>(output.data.data());
for (size_t j = 0; j < outputs_size; j += 10) {
EXPECT_NEAR((fp32_data[j] - truth_values[j / 10]) / truth_values[j / 10],
0., 9e-2);
}
}
} // namespace inference
} // namespace paddle
......@@ -33,9 +33,8 @@ static std::vector<float> truth_values = {
TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
AnalysisConfig config;
// num_ipu, enable_pipelining, batches_per_step, batch_size,
// need_avg_shard
config.EnableIpu(1, false);
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
config.EnableIpu(1, 1, false);
config.SetModel(model_dir + "/model", model_dir + "/params");
std::vector<PaddleTensor> inputs;
......@@ -72,9 +71,8 @@ TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) {
TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
AnalysisConfig config;
// num_ipu, enable_pipelining, batches_per_step, batch_size,
// need_avg_shard
config.EnableIpu(2, false, 1, 2, 1);
// ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
config.EnableIpu(1, 2, false);
config.SetModel(model_dir + "/model", model_dir + "/params");
std::vector<PaddleTensor> inputs;
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains a simple demo for how to take a model for inference with
* IPUs.
* Model: wget -q
* http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz
*/
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::CreatePredictor;
void inference(std::string model_path, bool use_ipu,
std::vector<float> *out_data) {
//# 1. Create Predictor with a config.
Config config;
config.SetModel(FLAGS_infer_model);
if (use_ipu) {
// ipu_device_num, ipu_micro_batch_size
config.EnableIpu(1, 4);
}
auto predictor = CreatePredictor(config);
//# 2. Prepare input/output tensor.
auto input_names = predictor->GetInputNames();
std::vector<int64_t> data{1, 2, 3, 4};
// For simplicity, we set all the slots with the same data.
for (auto input_name : input_names) {
auto input_tensor = predictor->GetInputHandle(input_name);
input_tensor->Reshape({4, 1});
input_tensor->CopyFromCpu(data.data());
}
//# 3. Run
predictor->Run();
//# 4. Get output.
auto output_names = predictor->GetOutputNames();
auto output_tensor = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data->resize(out_num);
output_tensor->CopyToCpu(out_data->data());
}
int main(int argc, char *argv[]) {
::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
std::vector<float> ipu_result;
std::vector<float> cpu_result;
inference(FLAGS_infer_model, true, &ipu_result);
inference(FLAGS_infer_model, false, &cpu_result);
for (size_t i = 0; i < ipu_result.size(); i++) {
CHECK_NEAR(ipu_result[i], cpu_result[i], 1e-6);
}
LOG(INFO) << "Finished";
}
......@@ -76,10 +76,23 @@ DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance.");
DEFINE_bool(fuse_multi_gru, false,
"Running the inference program with multi_gru_fuse_pass");
// ipu related
DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size");
DEFINE_int32(ipu_device_num, 1, "device num");
DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining");
DEFINE_int32(ipu_batches_per_step, 1,
"the number of batches per run in pipelining");
DEFINE_bool(ipu_enable_fp16, false, "enable fp16");
DEFINE_int32(ipu_replica_num, 1, "replica num");
DEFINE_double(ipu_available_memory_proportion, 1.0,
"available memory proportion");
DEFINE_bool(ipu_enable_half_partial, false, "enable half partial");
namespace paddle {
namespace inference {
using paddle::framework::proto::VarType;
using float16 = paddle::platform::float16;
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
......@@ -1060,5 +1073,44 @@ static bool CompareTensor(const framework::LoDTensor &a,
return true;
}
void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT
) {
int num = 1;
for (auto dim : tensor.shape) {
num *= dim;
}
PADDLE_ENFORCE_EQ(
tensor.dtype, PaddleDType::FLOAT32,
platform::errors::InvalidArgument(
"The tensor dtype is not float32, only support float32 as input"));
float *fp32_data = reinterpret_cast<float *>(tensor.data.data());
float16 *fp16_data = new float16[num];
for (int i = 0; i < num; i++) {
fp16_data[i] = float16(fp32_data[i]);
}
tensor.data =
PaddleBuf(static_cast<void *>(fp16_data), num * sizeof(float16));
tensor.dtype = PaddleDType::FLOAT16;
}
void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT
) {
int num = 1;
for (auto dim : tensor.shape) {
num *= dim;
}
PADDLE_ENFORCE_EQ(
tensor.dtype, PaddleDType::FLOAT16,
platform::errors::InvalidArgument(
"The tensor dtype is not float16, only support float16 as input"));
float16 *fp16_data = reinterpret_cast<float16 *>(tensor.data.data());
float *fp32_data = new float[num];
for (int i = 0; i < num; i++) {
fp32_data[i] = static_cast<float>(fp16_data[i]);
}
tensor.data = PaddleBuf(static_cast<void *>(fp32_data), num * sizeof(float));
tensor.dtype = PaddleDType::FLOAT32;
}
} // namespace inference
} // namespace paddle
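A short sketch of how these two helpers are used together, mirroring the FP16 test pattern earlier in this change; the predictor, tensor shape, and values are illustrative, and tester_helper.h is assumed to be included:

// Illustrative only: feed an FP16 input and convert the FP16 fetch back.
void Fp16RoundTrip(paddle::PaddlePredictor *predictor) {
  std::vector<float> values(4, 1.0f);  // placeholder input data
  paddle::PaddleTensor in;
  in.shape = {1, 4};
  in.data = paddle::PaddleBuf(static_cast<void *>(values.data()),
                              values.size() * sizeof(float));
  in.dtype = paddle::PaddleDType::FLOAT32;
  paddle::inference::ConvertFP32toFP16(in);  // dtype is now FLOAT16

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({in}, &outputs);
  paddle::inference::ConvertFP16toFP32(outputs.front());  // back to FLOAT32
}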