Commit 722d9581 authored by chenxuyi, committed by Meiyim

+ fast inference

Parent 815925c5
......@@ -1056,7 +1056,7 @@ ERNIE provides a development kit for model compression and acceleration via data distillation
If you finetune with `propeller`, `BestInferenceExporter` will pick the best model according to the evaluation metrics during finetuning and export it as an inference_model.
### Online inference
You can then use the [PaddleInference C++ API](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/native_infer.html) to link the model's forward pass into your production environment. Alternatively, you can use the prebuilt Python inference engine to stand up a simple service. Run the following command to start a propeller server:
You can then use the [ERNIE fast inference C++ API](./inference/README.md) to link the model's forward pass into your production environment. Alternatively, you can use the prebuilt Python inference engine to stand up a simple service. Run the following command to start a propeller server:
```script
python -m propeller.tools.start_server -m /path/to/saved/model -p 8888
```
......
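For a quick smoke test of the server, a minimal client sketch follows. Everything in it is an assumption rather than a documented interface: the `propeller.service.client.InferenceClient` import path, the endpoint string, and the call signature are guesses modeled on typical propeller checkouts, so verify them against the `propeller` package in your tree.

```python
# Hypothetical client sketch -- the module path, constructor, and call
# signature are assumptions; check propeller/service/client.py in your checkout.
import numpy as np
from propeller.service.client import InferenceClient  # assumed import path

client = InferenceClient('tcp://localhost:8888')  # port given to start_server -p
seq_len = 128
src_ids = np.zeros([1, seq_len, 1], dtype=np.int64)   # token ids, padded
sent_ids = np.zeros([1, seq_len, 1], dtype=np.int64)  # segment ids
outputs = client(src_ids, sent_ids)  # propeller inference_model feeds these two fields
print(outputs[0].shape)
```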
......@@ -40,12 +40,12 @@ def create_model(args,
is_regression=False,
ernie_version="1.0"):
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
qids = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64')
src_ids = fluid.layers.data(name='eval_placeholder_0', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='eval_placeholder_1', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='eval_placeholder_2', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='eval_placeholder_3', shape=[-1, args.max_seq_len, 1], dtype='float32')
task_ids = fluid.layers.data(name='eval_placeholder_4', shape=[-1, args.max_seq_len, 1], dtype='int64')
qids = fluid.layers.data(name='eval_placeholder_5', shape=[-1, 1], dtype='int64')
if is_classify:
labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
......@@ -87,7 +87,7 @@ def create_model(args,
else:
probs = logits
feed_targets_name = [
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
]
if ernie_version == "2.0":
feed_targets_name += [task_ids.name]
......
# ERNIE fast inference (C++)
The ERNIE C++ fast inference API offers a more efficient online inference option: it can be compiled and linked directly into your production environment for better performance.
It is implemented on top of [fluid inference](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/native_infer.html).
**Make sure your fluid inference version is higher than 1.6.3, or predictions may be incorrect.**
This page provides a demo and benchmark for ERNIE C++ fast inference.
## Preparation
The demo data is taken from the test split of the XNLI dataset and is located in ./data. It uses a plain-text id format in which each line is one batch with four fields:
```text
src_ids, pos_ids, sent_ids, self_attn_mask
```
Fields are separated by semicolons (;). Each field consists of a `shape` part and a `data` part separated by a colon (:); within `shape` and `data`, values are separated by spaces. `self_attn_mask` is FLOAT32; all other fields are INT64.
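To make the layout concrete, here is a small Python sketch that serializes one toy batch into this format; the tiny sequence length and the token values are made up for illustration, and the tensor shapes mirror the `create_model` placeholders above.

```python
# Serialize one batch into the demo's plain-text format:
# fields joined by ';', shape and data joined by ':', values by spaces.
import numpy as np

def to_field(arr):
    shape = ' '.join(str(d) for d in arr.shape)
    data = ' '.join(str(v) for v in arr.flatten().tolist())
    return shape + ':' + data

seq_len = 4  # toy length; the benchmark data uses 128
src_ids = np.array([1, 795, 2000, 2], dtype=np.int64).reshape(1, seq_len, 1)
pos_ids = np.arange(seq_len, dtype=np.int64).reshape(1, seq_len, 1)
sent_ids = np.zeros((1, seq_len, 1), dtype=np.int64)
self_attn_mask = np.ones((1, seq_len, 1), dtype=np.float32)

line = ';'.join(to_field(t) for t in (src_ids, pos_ids, sent_ids, self_attn_mask))
print(line)  # one batch per line
```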
ERNIE fast inference takes a model in inference\_model format as input; see [here](../README.zh.md#生成inference_model) for how to generate an inference\_model.
**An inference\_model produced by propeller only takes the `src_ids` and `sent_ids` fields, so the data file needs to be adjusted accordingly.**
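A filter along the following lines can produce such a reduced file; it is a sketch that assumes the field order documented above (`src_ids, pos_ids, sent_ids, self_attn_mask`), keeping indices 0 and 2.

```python
# Keep only src_ids (field 0) and sent_ids (field 2) of each input line,
# assuming the documented field order; adjust the indices if your files differ.
import sys

for line in sys.stdin:
    fields = line.rstrip('\n').split(';')
    sys.stdout.write(';'.join((fields[0], fields[2])) + '\n')
```

Run it as, for example, `python strip_fields.py < data/sample > data/sample.propeller` (both file names here are placeholders).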
## Build and run
To build this demo, your C++ compiler must support the C++11 standard.
Download the matching [fluid_inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_usage/deploy/inference/build_and_install_lib_cn.html): pick the build that matches your paddle version and configuration (avx or not, mkl or not, CUDA/cuDNN versions) and unpack it. You will get a `fluid_inference` directory; place it in the same directory as `inference.cc`.
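For orientation, the demo's CMakeLists.txt resolves headers and libraries against roughly this layout inside the unpacked package (only the paths the build actually uses are shown):

```text
fluid_inference/
├── paddle/
│   ├── include/              # paddle_inference_api.h
│   └── lib/                  # libpaddle_fluid.*
└── third_party/install/
    ├── glog/{include,lib}
    ├── gflags/{include,lib}
    ├── mkldnn/lib
    └── mklml/lib
```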
Build with:
``` bash
cd ./gpu # cd ./cpu
mkdir build
cd build
cmake ..
make
```
Run with:
```
./run.sh ../data/sample /path/to/inference_model_dir
```
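With `--output_prediction true` (as set in `run.sh`), the demo prints one line per sample containing three tab-separated class probabilities (for XNLI: contradiction, entailment, neutral), and finishes with a log line reporting the average per-sample latency.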
## 性能测试
Test samples: the XNLI test split, with BatchSize=1 and SequenceLength=128.
Each run is repeated 5 times and the average is reported.

| mode | latency (ms) |
| ----- | ----- |
| CPU (Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz, 20 threads) | 8.5 |
| GPU (P4) | 29.8818 |
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
# All paths below point into the prebuilt fluid_inference package unpacked next to inference.cc
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
if (fields.size() <= 2) return false;
tensors->clear();
tensors->reserve(4);
int i = 0;
// src_ids
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
while (std::getline(fin, line)) {
lineno++;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
// CPU-only build: the --use_gpu / --device flags are parsed but not used here.
config.DisableGpu();
config.SwitchIrOptim();
config.EnableMKLDNN();
config.SetCpuMathLibraryNumThreads(20);
//config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (const auto &feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
// each sample yields 3 floats (XNLI: contradiction / entailment / neutral, cf. PrintOutputs)
num_samples += fetch.front().data.length() / (sizeof(float) * 3);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0
This diff is collapsed.
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
# All paths below point into the prebuilt fluid_inference package unpacked next to inference.cc
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
if (fields.size() <= 2) return false;
tensors->clear();
tensors->reserve(4);
int i = 0;
// src_ids
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
while (std::getline(fin, line)) {
lineno++;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
// Use the GPU selected with --device; 100 MB initial GPU memory pool.
config.EnableUseGpu(100, FLAGS_device);
config.SwitchSpecifyInputNames(true);
config.EnableCUDNN();
config.SwitchIrOptim(true);
config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (const auto &feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
// each sample yields 3 floats (XNLI: contradiction / entailment / neutral, cf. PrintOutputs)
num_samples += fetch.front().data.length() / (sizeof(float) * 3);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0