Commit 722d9581 authored by chenxuyi, committed by Meiyim

+ fast inference

Parent 815925c5
......@@ -1056,7 +1056,7 @@ ERNIE provides a development kit for model compression and acceleration via data distillation
If you finetune with `propeller`, `BestInferenceExporter` will pick the best model according to the evaluation metrics during finetuning and export it as an inference_model.
### Online inference
You can then use the [PaddleInference C++ API](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/native_infer.html) to link the model's forward pass into your production environment. Alternatively, you can use the prebuilt Python inference engine to stand up a simple service. Run the following command to start a propeller server:
You can then use the [ERNIE fast inference C++ API](./inference/README.md) to link the model's forward pass into your production environment. Alternatively, you can use the prebuilt Python inference engine to stand up a simple service. Run the following command to start a propeller server:
```script
python -m propeller.tools.start_server -m /path/to/saved/model -p 8888
```
......
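For a quick smoke test of the server, a minimal client sketch follows. Everything in it is an assumption rather than a documented interface: the `propeller.service.client.InferenceClient` import path, the endpoint string, and the call signature are guesses modeled on typical propeller checkouts, so verify them against the `propeller` package in your tree.

```python
# Hypothetical client sketch -- the module path, constructor, and call
# signature are assumptions; check propeller/service/client.py in your checkout.
import numpy as np
from propeller.service.client import InferenceClient  # assumed import path

client = InferenceClient('tcp://localhost:8888')  # port given to start_server -p
seq_len = 128
src_ids = np.zeros([1, seq_len, 1], dtype=np.int64)   # token ids, padded
sent_ids = np.zeros([1, seq_len, 1], dtype=np.int64)  # segment ids
outputs = client(src_ids, sent_ids)  # propeller inference_model feeds these two fields
print(outputs[0].shape)
```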
......@@ -40,12 +40,12 @@ def create_model(args,
is_regression=False,
ernie_version="1.0"):
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
qids = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64')
src_ids = fluid.layers.data(name='eval_placeholder_0', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='eval_placeholder_1', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='eval_placeholder_2', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='eval_placeholder_3', shape=[-1, args.max_seq_len, 1], dtype='float32')
task_ids = fluid.layers.data(name='eval_placeholder_4', shape=[-1, args.max_seq_len, 1], dtype='int64')
qids = fluid.layers.data(name='eval_placeholder_5', shape=[-1, 1], dtype='int64')
if is_classify:
labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
......@@ -87,7 +87,7 @@ def create_model(args,
else:
probs = logits
feed_targets_name = [
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
]
if ernie_version == "2.0":
feed_targets_name += [task_ids.name]
......
# ERNIE fast inference (C++)
The ERNIE C++ fast inference API offers a more efficient online inference option: it can be compiled and linked directly into your production environment for better performance.
It is implemented on top of [fluid inference](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/native_infer.html).
**Make sure your fluid inference version is higher than 1.6.3, or predictions may be incorrect.**
This page provides a demo and benchmark for ERNIE C++ fast inference.
## Preparation
The demo data is taken from the test split of the XNLI dataset and is located in ./data. It uses a plain-text id format in which each line is one batch with four fields:
```text
src_ids, pos_ids, sent_ids, self_attn_mask
```
Fields are separated by semicolons (;). Each field consists of a `shape` part and a `data` part separated by a colon (:); within `shape` and `data`, values are separated by spaces. `self_attn_mask` is FLOAT32; all other fields are INT64.
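To make the layout concrete, here is a small Python sketch that serializes one toy batch into this format; the tiny sequence length and the token values are made up for illustration, and the tensor shapes mirror the `create_model` placeholders above.

```python
# Serialize one batch into the demo's plain-text format:
# fields joined by ';', shape and data joined by ':', values by spaces.
import numpy as np

def to_field(arr):
    shape = ' '.join(str(d) for d in arr.shape)
    data = ' '.join(str(v) for v in arr.flatten().tolist())
    return shape + ':' + data

seq_len = 4  # toy length; the benchmark data uses 128
src_ids = np.array([1, 795, 2000, 2], dtype=np.int64).reshape(1, seq_len, 1)
pos_ids = np.arange(seq_len, dtype=np.int64).reshape(1, seq_len, 1)
sent_ids = np.zeros((1, seq_len, 1), dtype=np.int64)
self_attn_mask = np.ones((1, seq_len, 1), dtype=np.float32)

line = ';'.join(to_field(t) for t in (src_ids, pos_ids, sent_ids, self_attn_mask))
print(line)  # one batch per line
```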
ERNIE fast inference takes a model in inference\_model format as input; see [here](../README.zh.md#生成inference_model) for how to generate an inference\_model.
**An inference\_model produced by propeller only takes the `src_ids` and `sent_ids` fields, so the data file needs to be adjusted accordingly.**
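A filter along the following lines can produce such a reduced file; it is a sketch that assumes the field order documented above (`src_ids, pos_ids, sent_ids, self_attn_mask`), keeping indices 0 and 2.

```python
# Keep only src_ids (field 0) and sent_ids (field 2) of each input line,
# assuming the documented field order; adjust the indices if your files differ.
import sys

for line in sys.stdin:
    fields = line.rstrip('\n').split(';')
    sys.stdout.write(';'.join((fields[0], fields[2])) + '\n')
```

Run it as, for example, `python strip_fields.py < data/sample > data/sample.propeller` (both file names here are placeholders).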
## Build and run
To build this demo, your C++ compiler must support the C++11 standard.
Download the matching [fluid_inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_usage/deploy/inference/build_and_install_lib_cn.html): pick the build that matches your paddle version and configuration (avx or not, mkl or not, CUDA/cuDNN versions) and unpack it. You will get a `fluid_inference` directory; place it in the same directory as `inference.cc`.
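For orientation, the demo's CMakeLists.txt resolves headers and libraries against roughly this layout inside the unpacked package (only the paths the build actually uses are shown):

```text
fluid_inference/
├── paddle/
│   ├── include/              # paddle_inference_api.h
│   └── lib/                  # libpaddle_fluid.*
└── third_party/install/
    ├── glog/{include,lib}
    ├── gflags/{include,lib}
    ├── mkldnn/lib
    └── mklml/lib
```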
Build with:
``` bash
cd ./gpu # cd ./cpu
mkdir build
cd build
cmake ..
make
```
Run with:
```
./run.sh ../data/sample /path/to/inference_model_dir
```
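With `--output_prediction true` (as set in `run.sh`), the demo prints one line per sample containing three tab-separated class probabilities (for XNLI: contradiction, entailment, neutral), and finishes with a log line reporting the average per-sample latency.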
## 性能测试
Test samples: the XNLI test split, with BatchSize=1 and SequenceLength=128.
Each run is repeated 5 times and the average is reported.

| mode | latency (ms) |
| ----- | ----- |
| CPU (Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz, 20 threads) | 8.5 |
| GPU (P4) | 29.8818 |
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
# All paths below point into the prebuilt fluid_inference package unpacked next to inference.cc
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
if (fields.size() <= 2) return false;
tensors->clear();
tensors->reserve(4);
int i = 0;
// src_ids
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
while (std::getline(fin, line)) {
lineno++;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
// CPU-only build: the --use_gpu / --device flags are parsed but not used here.
config.DisableGpu();
config.SwitchIrOptim();
config.EnableMKLDNN();
config.SetCpuMathLibraryNumThreads(20);
//config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (const auto &feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
// each sample yields 3 floats (XNLI: contradiction / entailment / neutral, cf. PrintOutputs)
num_samples += fetch.front().data.length() / (sizeof(float) * 3);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0
This diff is collapsed.
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
# All paths below point into the prebuilt fluid_inference package unpacked next to inference.cc
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
if (fields.size() <= 2) return false;
tensors->clear();
tensors->reserve(4);
int i = 0;
// src_ids
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
while (std::getline(fin, line)) {
lineno++;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
// Use the GPU selected with --device; 100 MB initial GPU memory pool.
config.EnableUseGpu(100, FLAGS_device);
config.SwitchSpecifyInputNames(true);
config.EnableCUDNN();
config.SwitchIrOptim(true);
config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (const auto &feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
// each sample yields 3 floats (XNLI: contradiction / entailment / neutral, cf. PrintOutputs)
num_samples += fetch.front().data.length() / (sizeof(float) * 3);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0