提交 aeef92ab 编写于 作者: J Jiawei Wang 提交者: GitHub

Merge pull request #2 from PaddlePaddle/develop

merge with origin
# Travis CI configuration: builds a Docker image from tools/Dockerfile.ci and
# runs the style-check and build scripts inside that image.
language: generic
sudo: required
dist: trusty
os:
- linux
env:
# Single env entry. COMPILE_TYPE is used below as the image tag and is passed
# to the build script; DOCKERFILE_CPU points at the Dockerfile to build from.
- COMPILE_TYPE=CPU DOCKERFILE_CPU=$PWD/tools/Dockerfile.ci
services:
- docker
before_install:
# Build the CI image, tagged serving-img:<COMPILE_TYPE>, from the repo root.
- docker build -f ${DOCKERFILE_CPU} -t serving-img:${COMPILE_TYPE} .
install:
# Style check runs only when COMPILE_TYPE is CPU; the checkout is mounted at /Serving.
- if [ $COMPILE_TYPE == "CPU" ]; then docker run -it -v $PWD:/Serving serving-img:${COMPILE_TYPE} /bin/bash Serving/tools/serving_check_style.sh ; fi;
# Build step: run tools/serving_build.sh in the container with COMPILE_TYPE as its argument.
- docker run -it -v $PWD:/Serving serving-img:${COMPILE_TYPE} /bin/bash Serving/tools/serving_build.sh $COMPILE_TYPE
......@@ -49,8 +49,11 @@ set(THIRD_PARTY_BUILD_TYPE Release)
option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" OFF)
option(WITH_MKL "Compile Paddle Serving with MKL support." OFF)
option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" OFF)
option(CLIENT_ONLY "Compile client libraries and demos only" OFF)
option(CLIENT "Compile Paddle Serving Client" OFF)
option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
# Toggle for building the ELASTIC-CTR solution; off unless enabled at configure time.
option(WITH_ELASTIC_CTR "Compile ELASTIC-CTR solution" OFF)
option(PACK "Compile for whl" OFF)
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
......@@ -62,12 +65,12 @@ if (NOT DEFINED WITH_MKLDNN)
endif()
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
include(external/jsoncpp)
include(external/rocksdb)
#include(external/rocksdb)
endif()
#include(external/gtest)
if (SERVER OR CLIENT)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
......@@ -80,8 +83,9 @@ include(external/pybind11)
include(external/python)
include(generic)
include(flags)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
include(external/cudnn)
include(paddlepaddle)
endif()
......@@ -90,7 +94,7 @@ message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
include_directories(${PADDLE_SERVING_SOURCE_DIR})
include_directories(${PADDLE_SERVING_BINARY_DIR})
if(NOT CLIENT_ONLY)
if(SERVER)
set(EXTERNAL_LIBS
jsoncpp
gflags
......@@ -108,28 +112,27 @@ set(EXTERNAL_LIBS
brpc
)
if(NOT CLIENT_ONLY)
if(SERVER)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
endif()
if(NOT CLIENT_ONLY)
if(SERVER)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
list(APPEND EXTERNAL_LIBS paddlepaddle)
endif()
add_subdirectory(core)
if(NOT CLIENT_ONLY)
if(SERVER)
add_subdirectory(paddle_inference)
endif()
add_subdirectory(python)
#add_subdirectory(examples)
<img src='https://paddle-serving.bj.bcebos.com/imdb-demo%2FLogoMakr-3Bd2NM-300dpi.png' width = "600" height = "127">
<p align="center">
<br>
<img src='https://paddle-serving.bj.bcebos.com/imdb-demo%2FLogoMakr-3Bd2NM-300dpi.png' width = "600" height = "130">
<br>
</p>
<p align="center">
<br>
<a href="https://travis-ci.com/PaddlePaddle/Serving">
<img alt="Build Status" src="https://img.shields.io/travis/com/PaddlePaddle/Serving/develop">
</a>
<img alt="Release" src="https://img.shields.io/badge/Release-0.0.3-yellowgreen">
<img alt="Issues" src="https://img.shields.io/github/issues/PaddlePaddle/Serving">
<img alt="License" src="https://img.shields.io/github/license/PaddlePaddle/Serving">
<img alt="Slack" src="https://img.shields.io/badge/Join-Slack-green">
<br>
</p>
<h2 align="center">Motivation</h2>
We consider deploying deep learning inference service online to be a user-facing application in the future. **The goal of this project**: When you have trained a deep neural net with [Paddle](https://github.com/PaddlePaddle/Paddle), you can put the model online without much effort. A demo of serving is as follows:
<p align="center">
<img src="doc/demo.gif" width="700">
</p>
<h2 align="center">Some Key Features</h2>
[![Release](https://img.shields.io/badge/Release-0.0.3-yellowgreen)](Release)
[![Issues](https://img.shields.io/github/issues/PaddlePaddle/Serving)](Issues)
[![License](https://img.shields.io/github/license/PaddlePaddle/Serving)](LICENSE)
[![Slack](https://img.shields.io/badge/Join-Slack-green)](https://paddleserving.slack.com/archives/CU0PB4K35)
## Motivation
Paddle Serving helps deep learning developers deploy an online inference service without much effort. **The goal of this project**: once you have trained a deep neural net with [Paddle](https://github.com/PaddlePaddle/Paddle), you already have a model inference service.
## Key Features
- Integrate with Paddle training pipeline seamlessly; most Paddle models can be deployed **with one line command**.
- **Industrial serving features** supported, such as models management, online loading, online A/B testing etc.
- **Distributed Key-Value indexing** supported that is especially useful for large scale sparse features as model inputs.
......@@ -17,24 +32,56 @@ Paddle Serving helps deep learning developers deploy an online inference service
- **Multiple programming languages** supported on client side, such as Golang, C++ and python
- **Extensible framework design** that can support model serving beyond Paddle.
## Installation
<h2 align="center">Installation</h2>
We highly recommend you to run Paddle Serving in Docker, please visit [Run in Docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md)
```shell
pip install paddle-serving-client
pip install paddle-serving-server
```
## Quick Start Example
<h2 align="center">Quick Start Example</h2>
### Boston House Price Prediction model
``` shell
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
Python Client Request
Paddle Serving provides both HTTP-based and RPC-based services for users to access.
### HTTP service
Paddle Serving provides a built-in Python module called `paddle_serving_server.serve` that can start an RPC service or an HTTP service with a one-line command. If we specify the argument `--name uci`, we will have an HTTP service with a URL of `$IP:$PORT/uci/prediction`.
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci
```
<center>
| Argument | Type | Default | Description |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | Concurrency of current service |
| `port` | int | `9292` | Exposed port of current service to users|
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
Here, we use `curl` to send an HTTP POST request to the service we just started. Users can use any Python library to send an HTTP POST request as well, e.g., [requests](https://requests.readthedocs.io/en/master/).
</center>
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
### RPC service
A user can also start an RPC service with `paddle_serving_server.serve`. An RPC service is usually faster than an HTTP service, although a user needs to do some coding based on Paddle Serving's Python client API. Note that we do not specify `--name` here.
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
``` python
# A user can visit rpc service through paddle_serving_client API
from paddle_serving_client import Client
client = Client()
......@@ -45,27 +92,111 @@ data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
Here, `client.predict` function has two arguments. `feed` is a `python dict` with model input variable alias name and values. `fetch` assigns the prediction variables to be returned from servers. In the example, the name of `"x"` and `"price"` are assigned when the servable model is saved during training.
<h2 align="center"> Pre-built services with Paddle Serving</h2>
<h3 align="center">Chinese Word Segmentation</h3>
- **Description**:
``` shell
Chinese word segmentation HTTP service that can be deployed with one line command.
```
- **Download Servable Package**:
``` shell
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/lac/lac_model_jieba_web.tar.gz
```
- **Host web service**:
``` shell
tar -xzf lac_model_jieba_web.tar.gz
python lac_web_service.py jieba_server_model/ lac_workdir 9292
```
- **Request sample**:
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"words": "我爱北京天安门", "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction
```
- **Request result**:
``` shell
{"word_seg":"我|爱|北京|天安门"}
```
<h3 align="center">Image Classification</h3>
- **Description**:
``` shell
Image classification trained with Imagenet dataset. A label and corresponding probability will be returned.
```
- **Download Servable Package**:
``` shell
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imagenet-example/imagenet_demo.tar.gz
```
- **Host web service**:
``` shell
tar -xzf imagenet_demo.tar.gz
python image_classification_service_demo.py resnet50_serving_model
```
- **Request sample**:
<p align="center">
<br>
<img src='https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg' width = "200" height = "200">
<br>
</p>
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"url": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg", "fetch": ["score"]}' http://127.0.0.1:9292/image/prediction
```
- **Request result**:
``` shell
{"label":"daisy","prob":0.9341403245925903}
```
## Document
<h2 align="center">Document</h2>
[Design Doc(Chinese)](doc/DESIGN.md)
### New to Paddle Serving
- [How to save a servable model?](doc/SAVE.md)
- [An end-to-end tutorial from training to serving](doc/END_TO_END.md)
- [Write Bert-as-Service in 10 minutes](doc/Bert_10_mins.md)
[How to config Serving native operators on server side?](doc/SERVER_DAG.md)
### Developers
- [How to config Serving native operators on server side?](doc/SERVER_DAG.md)
- [How to develop a new Serving operator](doc/NEW_OPERATOR.md)
- [Golang client](doc/IMDB_GO_CLIENT.md)
- [Compile from source code(Chinese)](doc/COMPILE.md)
[How to develop a new Serving operator](doc/NEW_OPERATOR.md)
### About Efficiency
- [How to profile serving efficiency?(Chinese)](https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/util)
- [Benchmarks](doc/BENCHMARK.md)
[Golang client](doc/IMDB_GO_CLIENT.md)
### FAQ
- [FAQ(Chinese)](doc/FAQ.md)
[Compile from source code(Chinese)](doc/COMPILE.md)
[FAQ(Chinese)](doc/FAQ.md)
### Design
- [Design Doc(Chinese)](doc/DESIGN_DOC.md)
- [Design Doc(English)](doc/DESIGN_DOC_EN.md)
<h2 align="center">Community</h2>
### Slack
## Join Community
To connect with other users and contributors, you are welcome to join our [Slack channel](https://paddleserving.slack.com/archives/CUBPKHKMJ).
## Contribution
### Contribution
If you want to contribute code to Paddle Serving, please refer to the [Contribution Guidelines](doc/CONTRIBUTE.md).
### Feedback
For any feedback or to report a bug, please propose a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues).
### License
[Apache 2.0 License](https://github.com/PaddlePaddle/Serving/blob/develop/LICENSE)
<img src='https://paddle-serving.bj.bcebos.com/imdb-demo%2FLogoMakr-3Bd2NM-300dpi.png' width = "600" height = "127">
[![Build Status](https://img.shields.io/travis/com/PaddlePaddle/Serving/develop)](https://travis-ci.com/PaddlePaddle/Serving)
[![Release](https://img.shields.io/badge/Release-0.0.3-yellowgreen)](Release)
[![Issues](https://img.shields.io/github/issues/PaddlePaddle/Serving)](Issues)
[![License](https://img.shields.io/github/license/PaddlePaddle/Serving)](LICENSE)
[![Slack](https://img.shields.io/badge/Join-Slack-green)](https://paddleserving.slack.com/archives/CU0PB4K35)
## 动机
Paddle Serving 帮助深度学习开发者轻易部署在线预测服务。 **本项目目标**: 只要你使用 [Paddle](https://github.com/PaddlePaddle/Paddle) 训练了一个深度神经网络,你就同时拥有了该模型的预测服务。
<p align="center">
<img src="doc/demo.gif" width="700">
</p>
## 核心功能
- 与Paddle训练紧密连接,绝大部分Paddle模型可以 **一键部署**.
- 支持 **工业级的服务能力** 例如模型管理,在线加载,在线A/B测试等.
- 支持 **分布式键值对索引** 助力于大规模稀疏特征作为模型输入.
- 支持客户端和服务端之间 **高并发和高效通信**.
- 支持 **多种编程语言** 开发客户端,例如Golang,C++和Python.
- **可伸缩框架设计** 可支持不限于Paddle的模型服务.
## 安装
强烈建议您在Docker内构建Paddle Serving,请查看[如何在Docker中运行PaddleServing](doc/RUN_IN_DOCKER_CN.md)
```shell
pip install paddle-serving-client
pip install paddle-serving-server
```
## 快速启动示例
``` shell
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
Python客户端请求
``` python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## 文档
[开发文档](doc/DESIGN.md)
[如何在服务器端配置本地Op?](doc/SERVER_DAG.md)
[如何开发一个新的Op?](doc/NEW_OPERATOR.md)
[Golang 客户端](doc/IMDB_GO_CLIENT.md)
[从源码编译](doc/COMPILE.md)
[常见问答](doc/FAQ.md)
## 加入社区
如果您想要联系其他用户和开发者,欢迎加入我们的 [Slack channel](https://paddleserving.slack.com/archives/CUBPKHKMJ)
## 如何贡献代码
如果您想要贡献代码给Paddle Serving,请参考[Contribution Guidelines](doc/CONTRIBUTE.md)
......@@ -31,7 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "latest")
SET(PADDLE_VERSION "1.7.1")
if (WITH_GPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda${CUDA_VERSION_MAJOR}-cudnn7-avx-mkl")
......
......@@ -12,19 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License
if(NOT CLIENT_ONLY)
if(SERVER)
add_subdirectory(cube)
add_subdirectory(kvdb)
#add_subdirectory(kvdb)
endif()
if (CLIENT OR SERVER)
add_subdirectory(configure)
add_subdirectory(pdcodegen)
add_subdirectory(sdk-cpp)
if(CLIENT_ONLY)
endif()
if(CLIENT)
add_subdirectory(general-client)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
add_subdirectory(predictor)
add_subdirectory(general-server)
endif()
if (CLIENT OR SERVER)
add_subdirectory(util)
endif()
......@@ -33,7 +33,7 @@ py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.p
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
if (CLIENT_ONLY)
if (CLIENT)
py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
......@@ -51,7 +51,7 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(server_config_py_proto server_config_py_proto_init)
......@@ -87,4 +87,3 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
......@@ -20,16 +20,16 @@ namespace baidu {
namespace paddle_serving {
namespace configure {
int read_proto_conf(const std::string &conf_full_path,
google::protobuf::Message *conf);
int read_proto_conf(const std::string &conf_path,
const std::string &conf_file,
google::protobuf::Message *conf);
int write_proto_conf(google::protobuf::Message *message,
const std::string &output_path,
const std::string &output_file);
int read_proto_conf(const std::string &conf_full_path,
google::protobuf::Message *conf);
int read_proto_conf(const std::string &conf_path,
const std::string &conf_file,
google::protobuf::Message *conf);
int write_proto_conf(google::protobuf::Message *message,
const std::string &output_path,
const std::string &output_file);
} // namespace configure
} // namespace paddle_serving
......
......@@ -26,7 +26,8 @@ message FetchVar {
optional string name = 1;
optional string alias_name = 2;
optional bool is_lod_tensor = 3 [ default = false ];
repeated int32 shape = 4;
optional int32 fetch_type = 4 [ default = 0 ];
repeated int32 shape = 5;
}
message GeneralModelConfig {
repeated FeedVar feed_var = 1;
......
......@@ -52,9 +52,10 @@ message ModelToolkitConf { repeated EngineDesc engines = 1; };
message ResourceConf {
required string model_toolkit_path = 1;
required string model_toolkit_file = 2;
optional string cube_config_file = 3;
optional string general_model_path = 4;
optional string general_model_file = 5;
optional string general_model_path = 3;
optional string general_model_file = 4;
optional string cube_config_path = 5;
optional string cube_config_file = 6;
};
// DAG node dependency info
......
......@@ -99,6 +99,13 @@ class CubeAPI {
std::function<void(DictValue*, size_t)> parse,
std::string* version);
/**
* @brief: get all table names from cube server, thread safe.
* @return vector of table names (one entry per meta record, taken from
*         its dict_name field); takes no parameters.
*
*/
std::vector<std::string> get_table_names();
public:
static const char* error_msg(int error_code);
......
......@@ -682,5 +682,13 @@ int CubeAPI::opt_seek(const std::string& dict_name,
return ret;
}
std::vector<std::string> CubeAPI::get_table_names() {
const std::vector<const MetaInfo*> metas = _meta->metas();
std::vector<std::string> table_names;
for (auto itr = metas.begin(); itr != metas.end(); ++itr) {
table_names.push_back((*itr)->dict_name);
}
return table_names;
}
} // namespace mcube
} // namespace rec
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/cube/cube-server/include/cube/server.h"
#include <brpc/server.h>
#include "core/cube/cube-server/include/cube/framework.h"
#include "core/cube/cube-server/include/cube/server.h"
namespace rec {
namespace mcube {
......
if(CLIENT_ONLY)
if(CLIENT)
add_subdirectory(pybind11)
pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
......@@ -39,9 +39,25 @@ namespace baidu {
namespace paddle_serving {
namespace general_model {
typedef std::map<std::string, std::vector<float>> FetchedMap;
// Container for the fetched outputs of a prediction call, keyed by fetch
// variable name. int64 and float outputs are kept in separate maps; each
// name maps to a vector of per-sample value vectors (PredictorClient::predict
// sizes the outer vector to 1, batch_predict sizes it to the batch size).
class PredictorRes {
public:
PredictorRes() {}
~PredictorRes() {}
public:
// Returns a reference to the int64 results stored under `name`.
// Uses std::map::operator[], so an unknown name inserts (and returns) an
// empty entry rather than failing.
const std::vector<std::vector<int64_t>>& get_int64_by_name(
const std::string& name) {
return _int64_map[name];
}
// Returns a reference to the float results stored under `name`.
// Same operator[] behavior as above: an unknown name yields an empty entry.
const std::vector<std::vector<float>>& get_float_by_name(
const std::string& name) {
return _float_map[name];
}
// NOTE(review): BatchFetchedMap is not referenced by any member of this
// class — confirm whether it is still needed here.
typedef std::map<std::string, std::vector<std::vector<float>>> BatchFetchedMap;
public:
// Result storage; public because PredictorClient clears and fills these
// maps directly.
std::map<std::string, std::vector<std::vector<int64_t>>> _int64_map;
std::map<std::string, std::vector<std::vector<float>>> _float_map;
};
class PredictorClient {
public:
......@@ -55,31 +71,27 @@ class PredictorClient {
void set_predictor_conf(const std::string& conf_path,
const std::string& conf_file);
int create_predictor_by_desc(const std::string & sdk_desc);
int create_predictor_by_desc(const std::string& sdk_desc);
int create_predictor();
int destroy_predictor();
std::vector<std::vector<float>> predict(
const std::vector<std::vector<float>>& float_feed,
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<int64_t>>& int_feed,
const std::vector<std::string>& int_feed_name,
const std::vector<std::string>& fetch_name);
int predict(const std::vector<std::vector<float>>& float_feed,
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<int64_t>>& int_feed,
const std::vector<std::string>& int_feed_name,
const std::vector<std::string>& fetch_name,
PredictorRes& predict_res, // NOLINT
const int& pid);
std::vector<std::vector<std::vector<float>>> batch_predict(
int batch_predict(
const std::vector<std::vector<std::vector<float>>>& float_feed_batch,
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<std::vector<int64_t>>>& int_feed_batch,
const std::vector<std::string>& int_feed_name,
const std::vector<std::string>& fetch_name);
std::vector<std::vector<float>> predict_with_profile(
const std::vector<std::vector<float>>& float_feed,
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<int64_t>>& int_feed,
const std::vector<std::string>& int_feed_name,
const std::vector<std::string>& fetch_name);
const std::vector<std::string>& fetch_name,
PredictorRes& predict_res_batch, // NOLINT
const int& pid);
private:
PredictorApi _api;
......@@ -90,6 +102,7 @@ class PredictorClient {
std::map<std::string, int> _feed_name_to_idx;
std::map<std::string, int> _fetch_name_to_idx;
std::map<std::string, std::string> _fetch_name_to_var_name;
std::map<std::string, int> _fetch_name_to_type;
std::vector<std::vector<int>> _shape;
std::vector<int> _type;
std::vector<int64_t> _last_request_ts;
......
......@@ -93,6 +93,8 @@ int PredictorClient::init(const std::string &conf_file) {
<< " alias name: " << model_config.fetch_var(i).alias_name();
_fetch_name_to_var_name[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).name();
_fetch_name_to_type[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).fetch_type();
}
} catch (std::exception &e) {
LOG(ERROR) << "Failed load general model config" << e.what();
......@@ -130,35 +132,25 @@ int PredictorClient::create_predictor() {
_api.thrd_initialize();
}
std::vector<std::vector<float>> PredictorClient::predict(
const std::vector<std::vector<float>> &float_feed,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name) {
std::vector<std::vector<float>> fetch_result;
if (fetch_name.size() == 0) {
return fetch_result;
}
int PredictorClient::predict(const std::vector<std::vector<float>> &float_feed,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res,
const int &pid) { // NOLINT
predict_res._int64_map.clear();
predict_res._float_map.clear();
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
// we save infer_us at fetch_result[fetch_name.size()]
fetch_result.resize(fetch_name.size());
_api.thrd_clear();
_predictor = _api.fetch_predictor("general_model");
VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "float feed name size: " << float_feed_name.size();
VLOG(2) << "int feed name size: " << int_feed_name.size();
VLOG(2) << "fetch name size: " << fetch_name.size();
Request req;
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
std::vector<Tensor *> tensor_vec;
FeedInst *inst = req.add_insts();
for (auto &name : float_feed_name) {
......@@ -168,7 +160,6 @@ std::vector<std::vector<float>> PredictorClient::predict(
for (auto &name : int_feed_name) {
tensor_vec.push_back(inst->add_tensor_array());
}
VLOG(2) << "prepare tensor vec done.";
int vec_idx = 0;
for (auto &name : float_feed_name) {
......@@ -179,16 +170,14 @@ std::vector<std::vector<float>> PredictorClient::predict(
}
tensor->set_elem_type(1);
for (int j = 0; j < float_feed[vec_idx].size(); ++j) {
tensor->add_data(const_cast<char *>(reinterpret_cast<const char *>(
&(float_feed[vec_idx][j]))),
sizeof(float));
tensor->add_float_data(float_feed[vec_idx][j]);
}
vec_idx++;
}
VLOG(2) << "feed float feed var done.";
vec_idx = 0;
for (auto &name : int_feed_name) {
int idx = _feed_name_to_idx[name];
Tensor *tensor = tensor_vec[idx];
......@@ -197,15 +186,12 @@ std::vector<std::vector<float>> PredictorClient::predict(
}
tensor->set_elem_type(0);
for (int j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_data(const_cast<char *>(reinterpret_cast<const char *>(
&(int_feed[vec_idx][j]))),
sizeof(int64_t));
tensor->add_int64_data(int_feed[vec_idx][j]);
}
vec_idx++;
}
int64_t preprocess_end = timeline.TimeStampUS();
int64_t client_infer_start = timeline.TimeStampUS();
Response res;
......@@ -222,27 +208,41 @@ std::vector<std::vector<float>> PredictorClient::predict(
res.Clear();
if (_predictor->inference(&req, &res) != 0) {
LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
exit(-1);
return -1;
} else {
VLOG(2) << "predict done.";
client_infer_end = timeline.TimeStampUS();
postprocess_start = client_infer_end;
for (auto &name : fetch_name) {
int idx = _fetch_name_to_idx[name];
int len = res.insts(0).tensor_array(idx).data_size();
VLOG(2) << "fetch name: " << name;
VLOG(2) << "tensor data size: " << len;
fetch_result[idx].resize(len);
for (int i = 0; i < len; ++i) {
fetch_result[idx][i] =
*(const float *)res.insts(0).tensor_array(idx).data(i).c_str();
if (_fetch_name_to_type[name] == 0) {
int len = res.insts(0).tensor_array(idx).int64_data_size();
VLOG(2) << "fetch tensor : " << name << " type: int64 len : " << len;
predict_res._int64_map[name].resize(1);
predict_res._int64_map[name][0].resize(len);
for (int i = 0; i < len; ++i) {
predict_res._int64_map[name][0][i] =
res.insts(0).tensor_array(idx).int64_data(i);
}
} else if (_fetch_name_to_type[name] == 1) {
int len = res.insts(0).tensor_array(idx).float_data_size();
VLOG(2) << "fetch tensor : " << name << " type: float32 len : " << len;
predict_res._float_map[name].resize(1);
predict_res._float_map[name][0].resize(len);
for (int i = 0; i < len; ++i) {
predict_res._float_map[name][0][i] =
res.insts(0).tensor_array(idx).float_data(i);
}
}
postprocess_end = timeline.TimeStampUS();
}
postprocess_end = timeline.TimeStampUS();
}
if (FLAGS_profile_client) {
std::ostringstream oss;
oss << "PROFILE\t"
<< "pid:" << pid << "\t"
<< "prepro_0:" << preprocess_start << " "
<< "prepro_1:" << preprocess_end << " "
<< "client_infer_0:" << client_infer_start << " "
......@@ -261,30 +261,25 @@ std::vector<std::vector<float>> PredictorClient::predict(
fprintf(stderr, "%s\n", oss.str().c_str());
}
return fetch_result;
return 0;
}
std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
int PredictorClient::batch_predict(
const std::vector<std::vector<std::vector<float>>> &float_feed_batch,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<std::vector<int64_t>>> &int_feed_batch,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name) {
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch,
const int &pid) {
int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
std::vector<std::vector<std::vector<float>>> fetch_result_batch;
if (fetch_name.size() == 0) {
return fetch_result_batch;
}
predict_res_batch._int64_map.clear();
predict_res_batch._float_map.clear();
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
fetch_result_batch.resize(batch_size);
int fetch_name_num = fetch_name.size();
for (int bi = 0; bi < batch_size; bi++) {
fetch_result_batch[bi].resize(fetch_name_num);
}
_api.thrd_clear();
_predictor = _api.fetch_predictor("general_model");
......@@ -321,9 +316,7 @@ std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
}
tensor->set_elem_type(1);
for (int j = 0; j < float_feed[vec_idx].size(); ++j) {
tensor->add_data(const_cast<char *>(reinterpret_cast<const char *>(
&(float_feed[vec_idx][j]))),
sizeof(float));
tensor->add_float_data(float_feed[vec_idx][j]);
}
vec_idx++;
}
......@@ -342,15 +335,13 @@ std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int_feed[vec_idx][0];
for (int j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_data(const_cast<char *>(reinterpret_cast<const char *>(
&(int_feed[vec_idx][j]))),
sizeof(int64_t));
tensor->add_int64_data(int_feed[vec_idx][j]);
}
vec_idx++;
}
VLOG(2) << "batch [" << bi << "] "
<< "itn feed value prepared";
<< "int feed value prepared";
}
int64_t preprocess_end = timeline.TimeStampUS();
......@@ -376,20 +367,35 @@ std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
} else {
client_infer_end = timeline.TimeStampUS();
postprocess_start = client_infer_end;
for (auto &name : fetch_name) {
predict_res_batch._int64_map[name].resize(batch_size);
predict_res_batch._float_map[name].resize(batch_size);
}
for (int bi = 0; bi < batch_size; bi++) {
for (auto &name : fetch_name) {
int idx = _fetch_name_to_idx[name];
int len = res.insts(bi).tensor_array(idx).data_size();
VLOG(2) << "fetch name: " << name;
VLOG(2) << "tensor data size: " << len;
fetch_result_batch[bi][idx].resize(len);
VLOG(2)
<< "fetch name " << name << " index " << idx << " first data "
<< *(const float *)res.insts(bi).tensor_array(idx).data(0).c_str();
for (int i = 0; i < len; ++i) {
fetch_result_batch[bi][idx][i] =
*(const float *)res.insts(bi).tensor_array(idx).data(i).c_str();
if (_fetch_name_to_type[name] == 0) {
int len = res.insts(bi).tensor_array(idx).int64_data_size();
VLOG(2) << "fetch tensor : " << name << " type: int64 len : " << len;
predict_res_batch._int64_map[name][bi].resize(len);
VLOG(2) << "fetch name " << name << " index " << idx << " first data "
<< res.insts(bi).tensor_array(idx).int64_data(0);
for (int i = 0; i < len; ++i) {
predict_res_batch._int64_map[name][bi][i] =
res.insts(bi).tensor_array(idx).int64_data(i);
}
} else if (_fetch_name_to_type[name] == 1) {
int len = res.insts(bi).tensor_array(idx).float_data_size();
VLOG(2) << "fetch tensor : " << name
<< " type: float32 len : " << len;
predict_res_batch._float_map[name][bi].resize(len);
VLOG(2) << "fetch name " << name << " index " << idx << " first data "
<< res.insts(bi).tensor_array(idx).float_data(0);
for (int i = 0; i < len; ++i) {
predict_res_batch._float_map[name][bi][i] =
res.insts(bi).tensor_array(idx).float_data(i);
}
}
}
}
......@@ -399,6 +405,7 @@ std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
if (FLAGS_profile_client) {
std::ostringstream oss;
oss << "PROFILE\t"
<< "pid:" << pid << "\t"
<< "prepro_0:" << preprocess_start << " "
<< "prepro_1:" << preprocess_end << " "
<< "client_infer_0:" << client_infer_start << " "
......@@ -417,17 +424,7 @@ std::vector<std::vector<std::vector<float>>> PredictorClient::batch_predict(
fprintf(stderr, "%s\n", oss.str().c_str());
}
return fetch_result_batch;
}
std::vector<std::vector<float>> PredictorClient::predict_with_profile(
const std::vector<std::vector<float>> &float_feed,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name) {
std::vector<std::vector<float>> res;
return res;
return 0;
}
} // namespace general_model
......
......@@ -17,18 +17,18 @@
#include "core/general-client/include/general_model.h"
using namespace std;
using namespace std; // NOLINT
using baidu::paddle_serving::general_model::PredictorClient;
using baidu::paddle_serving::general_model::FetchedMap;
int main(int argc, char * argv[]) {
PredictorClient * client = new PredictorClient();
int main(int argc, char* argv[]) {
PredictorClient* client = new PredictorClient();
client->init("inference.conf");
client->set_predictor_conf("./", "predictor.conf");
client->create_predictor();
std::vector<std::vector<float> > float_feed;
std::vector<std::vector<int64_t> > int_feed;
std::vector<std::vector<float>> float_feed;
std::vector<std::vector<int64_t>> int_feed;
std::vector<std::string> float_feed_name;
std::vector<std::string> int_feed_name = {"words", "label"};
std::vector<std::string> fetch_name = {"cost", "acc", "prediction"};
......@@ -53,13 +53,14 @@ int main(int argc, char * argv[]) {
cin >> label;
int_feed.push_back({label});
FetchedMap result;
client->predict(
float_feed, float_feed_name,
int_feed, int_feed_name, fetch_name,
&result);
client->predict(float_feed,
float_feed_name,
int_feed,
int_feed_name,
fetch_name,
&result);
cout << label << "\t" << result["prediction"][1] << endl;
......
......@@ -20,8 +20,6 @@
namespace py = pybind11;
using baidu::paddle_serving::general_model::FetchedMap;
namespace baidu {
namespace paddle_serving {
namespace general_model {
......@@ -29,6 +27,20 @@ namespace general_model {
PYBIND11_MODULE(serving_client, m) {
m.doc() = R"pddoc(this is a practice
)pddoc";
py::class_<PredictorRes>(m, "PredictorRes", py::buffer_protocol())
.def(py::init())
.def("get_int64_by_name",
[](PredictorRes &self, std::string &name) {
return self.get_int64_by_name(name);
},
py::return_value_policy::reference)
.def("get_float_by_name",
[](PredictorRes &self, std::string &name) {
return self.get_float_by_name(name);
},
py::return_value_policy::reference);
py::class_<PredictorClient>(m, "PredictorClient", py::buffer_protocol())
.def(py::init())
.def("init_gflags",
......@@ -46,8 +58,9 @@ PYBIND11_MODULE(serving_client, m) {
self.set_predictor_conf(conf_path, conf_file);
})
.def("create_predictor_by_desc",
[](PredictorClient &self, const std::string & sdk_desc) {
self.create_predictor_by_desc(sdk_desc); })
[](PredictorClient &self, const std::string &sdk_desc) {
self.create_predictor_by_desc(sdk_desc);
})
.def("create_predictor",
[](PredictorClient &self) { self.create_predictor(); })
.def("destroy_predictor",
......@@ -58,14 +71,17 @@ PYBIND11_MODULE(serving_client, m) {
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name) {
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res,
const int &pid) {
return self.predict(float_feed,
float_feed_name,
int_feed,
int_feed_name,
fetch_name);
fetch_name,
predict_res,
pid);
})
.def("batch_predict",
[](PredictorClient &self,
const std::vector<std::vector<std::vector<float>>>
......@@ -74,12 +90,16 @@ PYBIND11_MODULE(serving_client, m) {
const std::vector<std::vector<std::vector<int64_t>>>
&int_feed_batch,
const std::vector<std::string> &int_feed_name,
const std::vector<std::string> &fetch_name) {
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch,
const int &pid) {
return self.batch_predict(float_feed_batch,
float_feed_name,
int_feed_batch,
int_feed_name,
fetch_name);
fetch_name,
predict_res_batch,
pid);
});
}
......
include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include)
include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../../)
include(op/CMakeLists.txt)
include(proto/CMakeLists.txt)
add_executable(serving ${serving_srcs})
......@@ -24,8 +24,6 @@ target_link_libraries(serving pdserving)
target_link_libraries(serving cube-api)
target_link_libraries(serving utils)
target_link_libraries(serving kvdb rocksdb)
if(WITH_GPU)
target_link_libraries(serving ${CUDA_LIBRARIES})
endif()
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_copy_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralCopyOp::inference() {
// reade request from client
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name());
VLOG(2) << "precedent name: " << pre_name();
const TensorVector *in = &input_blob->tensor_vector;
VLOG(2) << "input size: " << in->size();
int batch_size = input_blob->GetBatchSize();
int input_var_num = 0;
GeneralBlob *res = mutable_data<GeneralBlob>();
TensorVector *out = &res->tensor_vector;
VLOG(2) << "input batch size: " << batch_size;
res->SetBatchSize(batch_size);
if (!res) {
LOG(ERROR) << "Failed get op tls reader object output";
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
VLOG(2) << "Going to init lod tensor";
for (int i = 0; i < in->size(); ++i) {
paddle::PaddleTensor lod_tensor;
CopyLod(&in->at(i), &lod_tensor);
lod_tensor.dtype = in->at(i).dtype;
lod_tensor.name = in->at(i).name;
VLOG(2) << "lod tensor [" << i << "].name = " << lod_tensor.name;
out->push_back(lod_tensor);
}
VLOG(2) << "pack done.";
for (int i = 0; i < out->size(); ++i) {
int64_t *src_ptr = static_cast<int64_t *>(in->at(i).data.data());
out->at(i).data.Resize(out->at(i).lod[0].back() * sizeof(int64_t));
out->at(i).shape = {out->at(i).lod[0].back(), 1};
int64_t *tgt_ptr = static_cast<int64_t *>(out->at(i).data.data());
for (int j = 0; j < out->at(i).lod[0].back(); ++j) {
tgt_ptr[j] = src_ptr[j];
}
}
VLOG(2) << "output done.";
timeline.Pause();
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, res);
AddBlobInfo(res, start);
AddBlobInfo(res, end);
VLOG(2) << "read data from client success";
return 0;
}
DEFINE_OP(GeneralCopyOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#ifdef BCLOUD
#ifdef WITH_GPU
#include "paddle/paddle_inference_api.h"
#else
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#endif
#else
#include "paddle_inference_api.h" // NOLINT
#endif
#include <string>
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/resource.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
// Serving op whose channel payload is a GeneralBlob. Its inference() step is
// implemented in the matching .cpp file.
class GeneralCopyOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
// Container type for the tensors carried in a blob.
typedef std::vector<paddle::PaddleTensor> TensorVector;
// Op boilerplate macro; its expansion is defined outside this file.
DECLARE_OP(GeneralCopyOp);
// Executes the op and returns an int status code (0 on the success path).
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_dist_kv_infer_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <unordered_map>
#include <utility>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralDistKVInferOp::inference() {
VLOG(2) << "Going to run inference";
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name());
VLOG(2) << "Get precedent op name: " << pre_name();
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!input_blob) {
LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name();
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "input batch size: " << batch_size;
std::vector<uint64_t> keys;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0;
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
++dense_count;
continue;
}
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
}
key_len += elem_num;
int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "cube init error or cube config not given.";
return -1;
}
int ret = cube->seek(table_names[0], keys, &values);
if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "cube value return null";
}
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
dense_out.resize(dense_count);
int cube_val_idx = 0;
int sparse_idx = 0;
int dense_idx = 0;
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
++dense_idx;
continue;
}
sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
std::copy(in->at(i).lod[x].begin(),
in->at(i).lod[x].end(),
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
cube_val_idx++;
}
++sparse_idx;
}
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
output_blob->SetBatchSize(batch_size);
VLOG(2) << "infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
GENERAL_MODEL_NAME, &infer_in, out, batch_size)) {
LOG(ERROR) << "Failed do infer in fluid model: " << GENERAL_MODEL_NAME;
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralDistKVInferOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#ifdef BCLOUD
#ifdef WITH_GPU
#include "paddle/paddle_inference_api.h"
#else
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#endif
#else
#include "paddle_inference_api.h" // NOLINT
#endif
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
// Serving op whose channel payload is a GeneralBlob. Its inference() step,
// implemented in the matching .cpp file, looks up INT64 inputs in the cube
// key-value service before running the model.
class GeneralDistKVInferOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
// Container type for the tensors carried in a blob.
typedef std::vector<paddle::PaddleTensor> TensorVector;
// Op boilerplate macro; its expansion is defined outside this file.
DECLARE_OP(GeneralDistKVInferOp);
// Executes the op; returns 0 on success and -1 on failure.
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
......@@ -65,6 +65,19 @@ static void CopyBlobInfo(const GeneralBlob* src, GeneralBlob* tgt) {
src->p_size * sizeof(int64_t));
}
// Copies the nested lod vectors of `src` into `tgt`, replacing whatever lod
// `tgt` held before. No other field of `tgt` is touched.
//
// Both pointers must be non-null; they are dereferenced unconditionally.
static void CopyLod(const paddle::PaddleTensor* src,
                    paddle::PaddleTensor* tgt) {
  VLOG(2) << "src lod size: " << src->lod.size();
  // Vector assignment performs the same element-wise deep copy as the former
  // resize-and-index loops, without the signed/unsigned index comparisons.
  tgt->lod = src->lod;
  // Logged after the copy so the message reflects what actually happened.
  VLOG(2) << "copy lod done.";
}
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
......@@ -39,7 +39,6 @@ class GeneralInferOp
DECLARE_OP(GeneralInferOp);
int inference();
};
} // namespace serving
......
......@@ -104,17 +104,21 @@ int GeneralReaderOp::inference() {
VLOG(2) << "print general model config done.";
// TODO(guru4elephant): how to do conditional check?
/*
int ret = conf_check(req, model_config);
if (ret != 0) {
LOG(INFO) << "model conf of server:";
LOG(ERROR) << "model conf of server:";
resource.print_general_model_config(model_config);
return 0;
}
*/
// package tensor
elem_type.resize(var_num);
elem_size.resize(var_num);
capacity.resize(var_num);
// prepare basic information for input
for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor lod_tensor;
elem_type[i] = req->insts(0).tensor_array(i).elem_type();
......@@ -146,14 +150,22 @@ int GeneralReaderOp::inference() {
out->push_back(lod_tensor);
}
// specify the memory needed for output tensor_vector
for (int i = 0; i < var_num; ++i) {
if (out->at(i).lod.size() == 1) {
for (int j = 0; j < batch_size; ++j) {
const Tensor &tensor = req->insts(j).tensor_array(i);
int data_len = tensor.data_size();
VLOG(2) << "tensor size for var[" << i << "]: " << tensor.data_size();
int data_len = 0;
if (tensor.int64_data_size() > 0) {
data_len = tensor.int64_data_size();
} else {
data_len = tensor.float_data_size();
}
VLOG(2) << "tensor size for var[" << i << "]: " << data_len;
int cur_len = out->at(i).lod[0].back();
VLOG(2) << "current len: " << cur_len;
out->at(i).lod[0].push_back(cur_len + data_len);
VLOG(2) << "new len: " << cur_len + data_len;
}
......@@ -168,14 +180,15 @@ int GeneralReaderOp::inference() {
}
}
// fill the data into output general_blob
for (int i = 0; i < var_num; ++i) {
if (elem_type[i] == 0) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0; k < req->insts(j).tensor_array(i).data_size(); ++k) {
dst_ptr[offset + k] =
*(const int64_t *)req->insts(j).tensor_array(i).data(k).c_str();
int elem_num = req->insts(j).tensor_array(i).int64_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int64_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
......@@ -187,9 +200,9 @@ int GeneralReaderOp::inference() {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0; k < req->insts(j).tensor_array(i).data_size(); ++k) {
dst_ptr[offset + k] =
*(const float *)req->insts(j).tensor_array(i).data(k).c_str();
int elem_num = req->insts(j).tensor_array(i).float_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).float_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
......@@ -200,6 +213,8 @@ int GeneralReaderOp::inference() {
}
}
VLOG(2) << "output size: " << out->size();
timeline.Pause();
int64_t end = timeline.TimeStampUS();
res->p_size = 0;
......
......@@ -24,24 +24,23 @@
#include "paddle_inference_api.h" // NOLINT
#endif
#include <string>
#include "core/predictor/framework/resource.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/load_general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/resource.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralReaderOp : public baidu::paddle_serving::predictor::OpWithChannel<
GeneralBlob> {
class GeneralReaderOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralReaderOp);
int inference();
};
} // namespace serving
......
......@@ -95,36 +95,67 @@ int GeneralResponseOp::inference() {
int var_idx = 0;
for (auto &idx : fetch_index) {
float *data_ptr = static_cast<float *>(in->at(idx).data.data());
int cap = 1;
for (int j = 1; j < in->at(idx).shape.size(); ++j) {
cap *= in->at(idx).shape[j];
}
if (model_config->_is_lod_fetch[idx]) {
for (int j = 0; j < batch_size; ++j) {
for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
k++) {
res->mutable_insts(j)->mutable_tensor_array(var_idx)->add_data(
reinterpret_cast<char *>(&(data_ptr[k])), sizeof(float));
if (in->at(idx).dtype == paddle::PaddleDType::INT64) {
int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
if (model_config->_is_lod_fetch[idx]) {
for (int j = 0; j < batch_size; ++j) {
for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
k++) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[k]);
}
}
} else {
int var_size = in->at(idx).shape[0];
if (var_size == batch_size) {
for (int j = 0; j < batch_size; ++j) {
for (int k = j * cap; k < (j + 1) * cap; ++k) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_int64_data(
data_ptr[k]);
}
}
} else {
for (int j = 0; j < batch_size; ++j) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[0]);
}
}
}
} else {
int var_size = in->at(idx).shape[0];
if (var_size == batch_size) {
var_idx++;
} else if (in->at(idx).dtype == paddle::PaddleDType::FLOAT32) {
float *data_ptr = static_cast<float *>(in->at(idx).data.data());
if (model_config->_is_lod_fetch[idx]) {
for (int j = 0; j < batch_size; ++j) {
for (int k = j * cap; k < (j + 1) * cap; ++k) {
res->mutable_insts(j)->mutable_tensor_array(var_idx)->add_data(
reinterpret_cast<char *>(&(data_ptr[k])), sizeof(float));
for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
k++) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[k]);
}
}
} else {
for (int j = 0; j < batch_size; ++j) {
res->mutable_insts(j)->mutable_tensor_array(var_idx)->add_data(
reinterpret_cast<char *>(&(data_ptr[0])), sizeof(float));
int var_size = in->at(idx).shape[0];
if (var_size == batch_size) {
for (int j = 0; j < batch_size; ++j) {
for (int k = j * cap; k < (j + 1) * cap; ++k) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_float_data(
data_ptr[k]);
}
}
} else {
for (int j = 0; j < batch_size; ++j) {
FetchInst *fetch_p = res->mutable_insts(j);
fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[0]);
}
}
}
var_idx++;
}
var_idx++;
}
if (req->profile_server()) {
......
......@@ -39,7 +39,6 @@ class GeneralResponseOp
DECLARE_OP(GeneralResponseOp);
int inference();
};
} // namespace serving
......
......@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_text_reader_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include "core/general-server/op/general_text_reader_op.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/util/include/timer.h"
......@@ -32,7 +32,6 @@ using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralTextReaderOp::inference() {
// read request from client
const Request *req = dynamic_cast<const Request *>(get_request_message());
......@@ -132,11 +131,9 @@ int GeneralTextReaderOp::inference() {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0;
k < req->insts(j).tensor_array(i).int_data_size();
for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
++k) {
dst_ptr[offset + k] =
req->insts(j).tensor_array(i).int_data(k);
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
......@@ -148,11 +145,9 @@ int GeneralTextReaderOp::inference() {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0;
k < req->insts(j).tensor_array(i).int_data_size();
for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
++k) {
dst_ptr[offset + k] =
req->insts(j).tensor_array(i).int_data(k);
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
......
......@@ -24,17 +24,17 @@
#include "paddle_inference_api.h" // NOLINT
#endif
#include <string>
#include "core/predictor/framework/resource.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/load_general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/resource.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralTextReaderOp :
public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
class GeneralTextReaderOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
......
......@@ -40,7 +40,6 @@ class GeneralTextResponseOp
DECLARE_OP(GeneralTextResponseOp);
int inference();
};
} // namespace serving
......
......@@ -22,18 +22,15 @@ option cc_generic_services = true;
message Tensor {
repeated bytes data = 1;
repeated int32 int_data = 2;
repeated float float_data = 3;
optional int32 elem_type = 4;
repeated int32 shape = 5;
repeated int64 int64_data = 3;
repeated float float_data = 4;
optional int32 elem_type = 5;
repeated int32 shape = 6;
};
message FeedInst {
repeated Tensor tensor_array = 1;
};
message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst {
repeated Tensor tensor_array = 1;
};
message FetchInst { repeated Tensor tensor_array = 1; };
message Request {
repeated FeedInst insts = 1;
......
......@@ -27,11 +27,11 @@
// limitations under the License.
#pragma once
#include <chrono>
#include <chrono> // NOLINT
#include <functional>
#include <memory>
#include <unordered_map>
#include <vector>
#include <functional>
class AbstractKVDB;
class FileReader;
class ParamDict;
......@@ -65,7 +65,7 @@ class FileReader {
std::string data;
FILE *stream = nullptr;
const int max_buffer = 256;
char buffer[max_buffer];
char buffer[max_buffer]; // NOLINT
cmd.append(" 2>&1");
stream = popen(cmd.c_str(), "r");
if (stream) {
......@@ -76,7 +76,8 @@ class FileReader {
return data;
};
std::string cmd = "md5sum " + this->filename_;
// TODO: throw exception if error occurs during execution of shell command
// NOLINT TODO: throw exception if error occurs during execution of shell
// command
std::string md5val = getCmdOut(cmd);
this->time_stamp_ = md5val == this->last_md5_val_
? this->time_stamp_
......@@ -93,7 +94,7 @@ class FileReader {
return this->time_stamp_;
}
inline virtual ~FileReader(){};
inline virtual ~FileReader() {}
private:
std::string filename_;
......@@ -128,7 +129,7 @@ class ParamDict {
virtual ~ParamDict();
private:
std::function<std::pair<Key, Value>(std::string)> read_func_;
std::function<std::pair<Key, Value>(std::string)> read_func_; // NOLINT
std::vector<FileReaderPtr> file_reader_lst_;
AbsKVDBPtr front_db, back_db;
};
......@@ -139,5 +140,5 @@ class ParamDictMgr {
void InsertParamDict(std::string, ParamDictPtr);
private:
std::unordered_map<std::string, ParamDictPtr> ParamDictMap;
std::unordered_map<std::string, ParamDictPtr> ParamDictMap; // NOLINT
};
......@@ -25,7 +25,7 @@
class RocksDBWrapper {
public:
RocksDBWrapper(std::string db_name);
RocksDBWrapper(std::string db_name); // NOLINT
std::string Get(std::string key);
bool Put(std::string key, std::string value);
......@@ -33,6 +33,7 @@ class RocksDBWrapper {
static std::shared_ptr<RocksDBWrapper> RocksDBWrapperFactory(
std::string db_name = "SparseMatrix");
void Close();
private:
rocksdb::DB *db_;
std::string db_name_;
......
......@@ -16,7 +16,7 @@
#include <fstream>
#include <iterator>
#include <sstream>
#include <thread>
#include <thread> // NOLINT
#include "core/kvdb/include/kvdb/rocksdb_impl.h"
std::vector<FileReaderPtr> ParamDict::GetDictReaderLst() {
......@@ -33,8 +33,10 @@ void ParamDict::SetFileReaderLst(std::vector<std::string> lst) {
std::vector<float> ParamDict::GetSparseValue(std::string feasign,
std::string slot) {
auto BytesToFloat = [](uint8_t* byte_array) { return *((float*)byte_array); };
// TODO: the concatenation of feasign and slot is TBD.
auto BytesToFloat = [](uint8_t* byte_array) {
return *((float*)byte_array); // NOLINT
};
// NOLINT TODO: the concatenation of feasign and slot is TBD.
std::string result = front_db->Get(feasign + slot);
std::vector<float> value;
if (result == "NOT_FOUND") return value;
......@@ -87,7 +89,7 @@ bool ParamDict::InsertSparseValue(std::string feasign,
value.push_back(raw_values_ptr[i]);
}
back_db->Set(key, value);
// TODO: change stateless to stateful
// NOLINT TODO: change stateless to stateful
return true;
}
......@@ -140,5 +142,4 @@ void ParamDict::CreateKVDB() {
this->back_db->CreateDB();
}
ParamDict::~ParamDict() {
}
ParamDict::~ParamDict() {}
......@@ -51,7 +51,7 @@ void RocksDBWrapper::SetDBName(std::string db_name) {
void RocksDBWrapper::Close() {
if (db_ != nullptr) {
db_->Close();
delete(db_);
delete (db_);
db_ = nullptr;
}
}
......
......@@ -32,12 +32,8 @@ void RocksKVDB::Set(std::string key, std::string value) {
return;
}
void RocksKVDB::Close() {
this->db_->Close();
}
void RocksKVDB::Close() { this->db_->Close(); }
std::string RocksKVDB::Get(std::string key) { return this->db_->Get(key); }
RocksKVDB::~RocksKVDB() {
this->db_->Close();
}
RocksKVDB::~RocksKVDB() { this->db_->Close(); }
......@@ -15,14 +15,14 @@
#include <list>
#include "boost/algorithm/string.hpp"
#include "boost/scoped_ptr.hpp"
#include "core/pdcodegen/pds_option.pb.h"
#include "core/pdcodegen/plugin/strutil.h"
#include "core/pdcodegen/plugin/substitute.h"
#include "google/protobuf/compiler/code_generator.h"
#include "google/protobuf/compiler/plugin.h"
#include "google/protobuf/descriptor.h"
#include "google/protobuf/io/printer.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "core/pdcodegen/pds_option.pb.h"
#include "core/pdcodegen/plugin/strutil.h"
#include "core/pdcodegen/plugin/substitute.h"
using std::string;
using google::protobuf::Descriptor;
using google::protobuf::FileDescriptor;
......@@ -115,7 +115,8 @@ class PdsCodeGenerator : public CodeGenerator {
printer.Print("#include \"core/predictor/common/inner_common.h\"\n");
printer.Print("#include \"core/predictor/framework/service.h\"\n");
printer.Print("#include \"core/predictor/framework/manager.h\"\n");
printer.Print("#include \"core/predictor/framework/service_manager.h\"\n");
printer.Print(
"#include \"core/predictor/framework/service_manager.h\"\n");
}
if (generate_stub) {
printer.Print("#include <baidu/rpc/parallel_channel.h>\n");
......@@ -845,7 +846,8 @@ class PdsCodeGenerator : public CodeGenerator {
printer.Print("#include \"core/predictor/common/inner_common.h\"\n");
printer.Print("#include \"core/predictor/framework/service.h\"\n");
printer.Print("#include \"core/predictor/framework/manager.h\"\n");
printer.Print("#include \"core/predictor/framework/service_manager.h\"\n");
printer.Print(
"#include \"core/predictor/framework/service_manager.h\"\n");
}
if (generate_stub) {
printer.Print("#include <brpc/parallel_channel.h>\n");
......
......@@ -3,20 +3,19 @@ include(common/CMakeLists.txt)
include(op/CMakeLists.txt)
include(mempool/CMakeLists.txt)
include(framework/CMakeLists.txt)
#include(plugin/CMakeLists.txt)
include(tools/CMakeLists.txt)
include(src/CMakeLists.txt)
include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include)
add_library(pdserving ${pdserving_srcs})
set_source_files_properties(
${pdserving_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
add_dependencies(pdserving protobuf kvdb boost brpc leveldb pdcodegen configure)
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
target_link_libraries(pdserving
brpc protobuf boost leveldb configure kvdb -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
# install
install(TARGETS pdserving
......
FILE(GLOB common_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
FILE(GLOB common_srcs ${CMAKE_CURRENT_LIST_DIR}/constant.cpp)
LIST(APPEND pdserving_srcs ${common_srcs})
......@@ -52,9 +52,9 @@
#include "glog/raw_logging.h"
#include "core/configure/general_model_config.pb.h"
#include "core/configure/include/configure_parser.h"
#include "core/configure/server_configure.pb.h"
#include "core/configure/general_model_config.pb.h"
#include "core/predictor/common/constant.h"
#include "core/predictor/common/types.h"
......
......@@ -45,7 +45,7 @@ int DagView::init(Dag* dag, const std::string& service_name) {
<< "at:" << si;
return ERR_MEM_ALLOC_FAILURE;
}
VLOG(2) << "stage[" << si << "] name: " << stage->full_name;
VLOG(2) << "stage[" << si << "] name: " << stage->full_name;
VLOG(2) << "stage[" << si << "] node size: " << stage->nodes.size();
vstage->full_name = service_name + NAME_DELIMITER + stage->full_name;
uint32_t node_size = stage->nodes.size();
......@@ -74,7 +74,7 @@ int DagView::init(Dag* dag, const std::string& service_name) {
LOG(WARNING) << "Failed init op, type:" << node->type;
return ERR_INTERNAL_FAILURE;
}
op->set_full_name(service_name + NAME_DELIMITER + node->full_name);
vnode->conf = node;
vnode->op = op;
......@@ -85,9 +85,9 @@ int DagView::init(Dag* dag, const std::string& service_name) {
VLOG(2) << "set op pre name: \n"
<< "current op name: " << vstage->nodes.back()->op->op_name()
<< " previous op name: "
<< _view[si-1]->nodes.back()->op->op_name();
<< _view[si - 1]->nodes.back()->op->op_name();
vstage->nodes.back()->op->set_pre_node_name(
_view[si-1]->nodes.back()->op->op_name());
_view[si - 1]->nodes.back()->op->op_name());
}
_view.push_back(vstage);
}
......
......@@ -16,8 +16,8 @@
#include <map>
#include <string>
#include <utility>
#include "glog/raw_logging.h"
#include "core/predictor/common/inner_common.h"
#include "glog/raw_logging.h"
namespace baidu {
namespace paddle_serving {
namespace predictor {
......
......@@ -39,8 +39,6 @@ DynamicResource::~DynamicResource() {}
int DynamicResource::initialize() { return 0; }
std::shared_ptr<RocksDBWrapper> Resource::getDB() { return db; }
std::shared_ptr<PaddleGeneralModelConfig> Resource::get_general_model_config() {
return _config;
}
......@@ -143,9 +141,16 @@ int Resource::initialize(const std::string& path, const std::string& file) {
LOG(ERROR) << "unable to create tls_bthread_key of thrd_data";
return -1;
}
// init rocksDB instance
if (db.get() == nullptr) {
db = RocksDBWrapper::RocksDBWrapperFactory("kvdb");
// init rocksDB or cube instance
if (resource_conf.has_cube_config_file() &&
resource_conf.has_cube_config_path()) {
LOG(INFO) << "init cube client, path[ " << resource_conf.cube_config_path()
<< " ], config file [ " << resource_conf.cube_config_file()
<< " ].";
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
"/" + resource_conf.cube_config_file();
this->cube_config_fullpath = cube_config_fullpath;
}
THREAD_SETSPECIFIC(_tls_bspec_key, NULL);
......@@ -155,6 +160,15 @@ int Resource::initialize(const std::string& path, const std::string& file) {
// model config
int Resource::general_model_initialize(const std::string& path,
const std::string& file) {
if (this->cube_config_fullpath.size() != 0) {
LOG(INFO) << "init cube by config file : " << this->cube_config_fullpath;
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
int ret = cube->init(this->cube_config_fullpath.c_str());
if (ret != 0) {
LOG(ERROR) << "cube init error";
return -1;
}
}
VLOG(2) << "general model path: " << path;
VLOG(2) << "general model file: " << file;
if (!FLAGS_enable_general_model) {
......@@ -197,13 +211,10 @@ int Resource::general_model_initialize(const std::string& path,
for (int i = 0; i < feed_var_num; ++i) {
_config->_feed_name[i] = model_config.feed_var(i).name();
_config->_feed_alias_name[i] = model_config.feed_var(i).alias_name();
VLOG(2) << "feed var[" << i << "]: "
<< _config->_feed_name[i];
VLOG(2) << "feed var[" << i << "]: "
<< _config->_feed_alias_name[i];
VLOG(2) << "feed var[" << i << "]: " << _config->_feed_name[i];
VLOG(2) << "feed var[" << i << "]: " << _config->_feed_alias_name[i];
_config->_feed_type[i] = model_config.feed_var(i).feed_type();
VLOG(2) << "feed type[" << i << "]: "
<< _config->_feed_type[i];
VLOG(2) << "feed type[" << i << "]: " << _config->_feed_type[i];
if (model_config.feed_var(i).is_lod_tensor()) {
VLOG(2) << "var[" << i << "] is lod tensor";
......
......@@ -13,12 +13,11 @@
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <map>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/kvdb/include/kvdb/paddle_rocksdb.h"
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
......@@ -36,15 +35,15 @@ class PaddleGeneralModelConfig {
public:
std::vector<std::string> _feed_name;
std::vector<std::string> _feed_alias_name;
std::vector<int> _feed_type; // 0 int64, 1 float
std::vector<bool> _is_lod_feed; // true lod tensor
std::vector<int> _feed_type; // 0 int64, 1 float
std::vector<bool> _is_lod_feed; // true lod tensor
std::vector<bool> _is_lod_fetch; // whether a fetch var is lod_tensor
std::vector<int> _capacity; // capacity for each tensor
/*
feed_shape_ for feeded variable
feed_shape_[i][j] represents the jth dim for ith input Tensor
if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
*/
std::vector<int> _capacity; // capacity for each tensor
/*
feed_shape_ for feeded variable
feed_shape_[i][j] represents the jth dim for ith input Tensor
if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
*/
std::vector<std::vector<int>> _feed_shape;
std::vector<std::string> _fetch_name;
......@@ -101,8 +100,6 @@ class Resource {
void print_general_model_config(
const std::shared_ptr<PaddleGeneralModelConfig>& config);
std::shared_ptr<RocksDBWrapper> getDB();
DynamicResource* get_dynamic_resource() {
return reinterpret_cast<DynamicResource*>(
THREAD_GETSPECIFIC(_tls_bspec_key));
......@@ -110,8 +107,8 @@ class Resource {
private:
int thread_finalize() { return 0; }
std::shared_ptr<RocksDBWrapper> db;
std::shared_ptr<PaddleGeneralModelConfig> _config;
std::string cube_config_fullpath;
THREAD_KEY_T _tls_bspec_key;
};
......
......@@ -7,6 +7,7 @@ LIST(APPEND protofiles
${CMAKE_CURRENT_LIST_DIR}/./builtin_format.proto
${CMAKE_CURRENT_LIST_DIR}/./msg_data.proto
${CMAKE_CURRENT_LIST_DIR}/./xrecord_format.proto
${CMAKE_CURRENT_LIST_DIR}/./framework.proto
)
PROTOBUF_GENERATE_SERVING_CPP(TRUE PROTO_SRCS PROTO_HDRS ${protofiles})
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package paddle.framework.proto;
// Any incompatible change to ProgramDesc or its dependencies should
// raise the version defined in version.h.
//
// Serialization and deserialization code should be modified in a way
// that supports old versions, following the version and compatibility policy.
message Version { optional int64 version = 1 [ default = 0 ]; }
enum AttrType {
INT = 0;
FLOAT = 1;
STRING = 2;
INTS = 3;
FLOATS = 4;
STRINGS = 5;
BOOLEAN = 6;
BOOLEANS = 7;
BLOCK = 8;
LONG = 9;
BLOCKS = 10;
LONGS = 11;
}
// OpDesc describes an instance of a C++ framework::OperatorBase
// derived class type.
message OpDesc {
// One named attribute of the operator. `type` is an AttrType tag; the
// remaining fields are optional/repeated value slots.
// NOTE(review): presumably exactly one value slot is populated, selected by
// `type` - confirm against the C++ reader.
// Field number 9 is not used by this message.
message Attr {
required string name = 1;
required AttrType type = 2;
optional int32 i = 3;
optional float f = 4;
optional string s = 5;
repeated int32 ints = 6;
repeated float floats = 7;
repeated string strings = 8;
optional bool b = 10;
repeated bool bools = 11;
optional int32 block_idx = 12;
optional int64 l = 13;
repeated int32 blocks_idx = 14;
repeated int64 longs = 15;
};
// Binds one parameter name to a list of argument strings.
message Var {
required string parameter = 1;
repeated string arguments = 2;
};
// Note: field numbers below are not in declaration order (type is 3).
required string type = 3;
repeated Var inputs = 1;
repeated Var outputs = 2;
repeated Attr attrs = 4;
optional bool is_target = 5 [ default = false ];
};
// OpProto describes a C++ framework::OperatorBase derived class.
message OpProto {
// VarProto describes the C++ type framework::Variable.
message Var {
required string name = 1;
required string comment = 2;
optional bool duplicable = 3 [ default = false ];
optional bool intermediate = 4 [ default = false ];
optional bool dispensable = 5 [ default = false ];
}
// AttrProto describes the C++ type Attribute.
message Attr {
required string name = 1;
required AttrType type = 2;
required string comment = 3;
// If that attribute is generated, it means the Paddle third
// language binding has responsibility to fill that
// attribute. End-User should not set that attribute.
optional bool generated = 4 [ default = false ];
}
required string type = 1;
repeated Var inputs = 2;
repeated Var outputs = 3;
repeated Attr attrs = 4;
required string comment = 5;
}
// VarType describes the type of a variable. `type` is the required tag; the
// optional sub-messages (selected_rows, lod_tensor, tensor_array, reader,
// tuple) carry extra description.
// NOTE(review): presumably only the sub-message matching `type` is set -
// confirm against the producer.
message VarType {
// Enum values are not listed in numeric order: 19-21 appear among the POD
// types, and value 16 is not assigned.
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions
LOD_TENSOR = 7;
SELECTED_ROWS = 8;
FEED_MINIBATCH = 9;
FETCH_LIST = 10;
STEP_SCOPES = 11;
LOD_RANK_TABLE = 12;
LOD_TENSOR_ARRAY = 13;
PLACE_LIST = 14;
READER = 15;
// Any runtime decided variable type is raw
// raw variables should manage their own allocations
// in operators like nccl_op
RAW = 17;
TUPLE = 18;
}
required Type type = 1;
// Element type plus dimensions of a tensor.
message TensorDesc {
// Should only be PODType. Is enforced in C++
required Type data_type = 1;
repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
}
optional TensorDesc selected_rows = 2;
// A TensorDesc together with its LoD level (defaults to 0).
message LoDTensorDesc {
required TensorDesc tensor = 1;
optional int32 lod_level = 2 [ default = 0 ];
}
optional LoDTensorDesc lod_tensor = 3;
// Same fields as LoDTensorDesc, declared as a separate message type.
message LoDTensorArrayDesc {
required TensorDesc tensor = 1;
optional int32 lod_level = 2 [ default = 0 ];
}
optional LoDTensorArrayDesc tensor_array = 4;
message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
optional ReaderDesc reader = 5;
message Tuple { repeated Type element_type = 1; }
// Field number 6 is not used by this message.
optional Tuple tuple = 7;
}
message VarDesc {
required string name = 1;
required VarType type = 2;
optional bool persistable = 3 [ default = false ];
// True if the variable is an input data and
// have to check the feed data shape and dtype
optional bool need_check_feed = 4 [ default = false ];
}
message BlockDesc {
required int32 idx = 1;
required int32 parent_idx = 2;
repeated VarDesc vars = 3;
repeated OpDesc ops = 4;
optional int32 forward_block_idx = 5 [ default = -1 ];
}
// CompatibleInfo is used to determine if a feature is compatible and
// provides the information.
message CompatibleInfo {
enum Type {
COMPATIBLE = 0;
DEFINITELY_NOT = 1;
POSSIBLE = 2;
BUG_FIX = 3;
PRECISION_CHANGE = 4;
}
required string version = 1;
required Type type = 2;
}
// In some cases, Paddle Fluid may perform operator definition iterations,
// and the operator uses OpCompatibleMap for compatibility testing.
message OpCompatibleMap {
message OpCompatiblePair {
required string op_name = 1;
required CompatibleInfo compatible_info = 2;
}
repeated OpCompatiblePair pair = 1;
optional string default_required_version = 2;
}
// Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// for more details.
// TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name?
message ProgramDesc {
reserved 2; // For backward compatibility.
repeated BlockDesc blocks = 1;
optional Version version = 4;
optional OpCompatibleMap op_compatible_map = 3;
}
......@@ -99,8 +99,8 @@ static void g_change_server_port() {
if (read_proto_conf(FLAGS_inferservice_path.c_str(),
FLAGS_inferservice_file.c_str(),
&conf) != 0) {
VLOG(2) << "failed to load configure[" << FLAGS_inferservice_path
<< "," << FLAGS_inferservice_file << "].";
VLOG(2) << "failed to load configure[" << FLAGS_inferservice_path << ","
<< FLAGS_inferservice_file << "].";
return;
}
uint32_t port = conf.port();
......@@ -157,8 +157,7 @@ int main(int argc, char** argv) {
mkdir(FLAGS_log_dir.c_str(), 0777);
ret = stat(FLAGS_log_dir.c_str(), &st_buf);
if (ret != 0) {
VLOG(2) << "Log path " << FLAGS_log_dir
<< " not exist, and create fail";
VLOG(2) << "Log path " << FLAGS_log_dir << " not exist, and create fail";
return -1;
}
}
......
# seq_generator tool: its two local sources plus whatever PROTO_SRCS holds
# when this file is processed.
set(seq_gen_src
  ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp
  ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp
  ${PROTO_SRCS})
add_executable(seq_generator ${seq_gen_src})
target_link_libraries(seq_generator protobuf -lpthread)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <netinet/in.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <ctime>
#include <iostream>
#include <unistd.h>
#include "seq_file.h"
// Opens `file` for binary output, fills the sync marker with pseudo-random
// bytes (reseeding the global rand() generator from the current time), and
// writes the sequence-file header straight away.
SeqFileWriter::SeqFileWriter(const char *file)
    : _bytes_to_prev_sync(0),
      _fs(new std::ofstream(file, std::ios::binary)) {
  std::srand(std::time(0));
  char *const marker_end = _sync_marker + SYNC_MARKER_SIZE;
  for (char *slot = _sync_marker; slot != marker_end; ++slot) {
    *slot = std::rand() % 255;
  }
  _write_seq_header();
}
// Closes and frees the underlying output stream.
// Safe to call more than once: the pointer is cleared after the first call so
// a repeated close() (for example from the destructor) cannot double-delete.
void SeqFileWriter::close() {
  if (_fs == nullptr) {
    return;
  }
  _fs->close();
  delete _fs;
  _fs = nullptr;
}
// Releases the output stream by delegating to close().
SeqFileWriter::~SeqFileWriter() {
  close();
}
// Emits four 0xFF bytes followed by this file's sync marker.
// NOTE(review): the 0xFF run presumably acts as the "sync escape" length of
// the Hadoop SequenceFile format - confirm against the format spec.
void SeqFileWriter::_write_sync_marker() {
  static const char kSyncEscape[4] = {'\xFF', '\xFF', '\xFF', '\xFF'};
  _fs->write(kSyncEscape, sizeof(kSyncEscape));
  _fs->write(_sync_marker, SYNC_MARKER_SIZE);
}
// Writes the fixed SEQ_HEADER bytes (without the string literal's trailing
// NUL) and then this file's sync marker.
void SeqFileWriter::_write_seq_header() {
  const std::streamsize header_len = sizeof(SEQ_HEADER) - 1;
  _fs->write(SEQ_HEADER, header_len);
  _fs->write(_sync_marker, SYNC_MARKER_SIZE);
}
// Appends one key/value record to the file.
//
// Layout written per record: big-endian uint32 record length (key + value),
// big-endian uint32 key length, key bytes, value bytes. After at least
// SYNC_INTERVAL record bytes have accumulated since the previous marker, a
// sync marker is written and the counter restarts.
//
// key       - pointer to the key bytes; key_len must equal sizeof(int64_t).
// value     - pointer to the value bytes.
// Returns 0 on success, -1 if the key length is wrong, the record does not
// fit the 32-bit length prefix, or the stream reports a write failure.
int SeqFileWriter::write(const char *key,
                         size_t key_len,
                         const char *value,
                         size_t value_len) {
  if (key_len != sizeof(int64_t)) {
    std::cout << "Key length not equal to " << sizeof(int64_t) << std::endl;
    return -1;
  }

  // The length prefix is 32 bits wide; refuse records that would otherwise be
  // silently truncated when narrowed to uint32_t.
  if (value_len > 0xFFFFFFFFu - key_len) {
    std::cout << "Record too large for a 32-bit length prefix" << std::endl;
    return -1;
  }

  const uint32_t record_len = static_cast<uint32_t>(key_len + value_len);
  const uint32_t b_record_len = htonl(record_len);
  const uint32_t b_key_len = htonl(static_cast<uint32_t>(key_len));

  _fs->write(reinterpret_cast<const char *>(&b_record_len),
             sizeof(b_record_len));
  _fs->write(reinterpret_cast<const char *>(&b_key_len), sizeof(b_key_len));
  _fs->write(key, key_len);
  _fs->write(value, value_len);
  if (!_fs->good()) {
    std::cout << "Failed to write record to sequence file" << std::endl;
    return -1;
  }

  // Accumulate in 64 bits so a very large record cannot wrap the int counter
  // and suppress the sync marker.
  const uint64_t since_sync =
      static_cast<uint64_t>(_bytes_to_prev_sync) + record_len;
  if (since_sync >= static_cast<uint64_t>(SYNC_INTERVAL)) {
    _write_sync_marker();
    _bytes_to_prev_sync = 0;
  } else {
    _bytes_to_prev_sync = static_cast<int>(since_sync);
  }
  return 0;
}
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __SEQ_FILE_H_
#define __SEQ_FILE_H_
#include <fstream>
// Number of bytes in the per-file sync marker.
const int SYNC_MARKER_SIZE = 16;
// Fixed bytes written at the start of every output file. Writers emit
// sizeof(SEQ_HEADER) - 1 bytes, i.e. everything except the literal's
// trailing NUL.
const char SEQ_HEADER[] =
// Magic "SEQ" followed by the byte 0x06.
"SEQ\x06"
// The class name appears twice. Each occurrence is preceded by a '"'
// character, whose byte value 0x22 (34) equals the length of the name.
// NOTE(review): presumably these act as length prefixes for the key and value
// class names in the Hadoop SequenceFile header - confirm against the spec.
"\"org.apache.hadoop.io.BytesWritable\""
"org.apache.hadoop.io.BytesWritable"
// Six zero bytes close the header.
// NOTE(review): presumably compression flags plus an empty metadata count -
// confirm against the spec.
"\x00\x00\x00\x00\x00\x00";
// Record bytes that must accumulate before another sync marker is written.
const int SYNC_INTERVAL = 2000;
// Writes key/value records into a sequence file.
//
// The writer owns a heap-allocated output stream, so it is non-copyable:
// a copy would share the raw pointer and delete it twice.
class SeqFileWriter {
 public:
  // Opens `file` for binary output and writes the file header.
  // `explicit` prevents accidental conversion from a C string.
  explicit SeqFileWriter(const char *file);
  // Closes and frees the output stream.
  ~SeqFileWriter();

  SeqFileWriter(const SeqFileWriter &) = delete;
  SeqFileWriter &operator=(const SeqFileWriter &) = delete;

 public:
  // Appends one record; returns 0 on success and -1 on failure.
  int write(const char *key,
            size_t key_len,
            const char *value,
            size_t value_len);

 private:
  void close();
  void _write_sync_marker();
  void _write_seq_header();

 private:
  char _sync_marker[SYNC_MARKER_SIZE];  // random per-file marker bytes
  int _bytes_to_prev_sync;              // record bytes since the last marker
  std::ofstream *_fs;                   // owned output stream
};
#endif //__SEQ_FILE_H_
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <vector>
#include "core/predictor/framework.pb.h"
#include "seq_file.h"
using paddle::framework::proto::VarType;
std::map<int, size_t> var_type_size;
// Fills var_type_size with the in-memory byte width of each supported
// VarType element type. FP16 is stored with the width of int16_t.
void reg_var_types() {
  struct TypeWidth {
    int type_id;
    size_t bytes;
  };
  const TypeWidth widths[] = {
      {static_cast<int>(VarType::FP16), sizeof(int16_t)},
      {static_cast<int>(VarType::FP32), sizeof(float)},
      {static_cast<int>(VarType::FP64), sizeof(double)},
      {static_cast<int>(VarType::INT32), sizeof(int)},
      {static_cast<int>(VarType::INT64), sizeof(int64_t)},
      {static_cast<int>(VarType::BOOL), sizeof(bool)},
      {static_cast<int>(VarType::SIZE_T), sizeof(size_t)},
      {static_cast<int>(VarType::INT16), sizeof(int16_t)},
      {static_cast<int>(VarType::UINT8), sizeof(uint8_t)},
      {static_cast<int>(VarType::INT8), sizeof(int8_t)},
  };
  for (const TypeWidth &entry : widths) {
    var_type_size[entry.type_id] = entry.bytes;
  }
}
// Reads exactly `len` bytes from `is` into `dst`.
// Returns false on a short read or any stream error.
static bool read_exact(std::ifstream &is, void *dst, size_t len) {
  is.read(static_cast<char *>(dst), static_cast<std::streamsize>(len));
  return !is.fail();
}

// Converts one serialized 2-D parameter file into a sequence file.
//
// The input is parsed as: uint32 version, uint64 LoD level followed by that
// many length-prefixed LoD blobs (skipped), a second uint32 version, an int32
// descriptor size, a serialized VarType::TensorDesc, and the raw tensor data.
// Each row i of the tensor becomes one record whose key is the int64 row
// index (host byte order) and whose value is the row's raw bytes.
//
// input_file  - path of the serialized parameter.
// output_file - path of the sequence file to create.
// Returns 0 on success and -1 on any open, read, parse, or write failure.
int dump_parameter(const char *input_file, const char *output_file) {
  // Binary mode: the payload is raw bytes and must not be newline-translated.
  std::ifstream is(input_file, std::ios::binary);
  if (!is.is_open()) {
    std::cout << "Cannot open parameter file " << input_file << std::endl;
    return -1;
  }

  // 1st field: uint32_t version of the LoDTensor.
  uint32_t version = 0;
  if (!read_exact(is, &version, sizeof(version))) {
    std::cout << "Cannot read LoDTensor version" << std::endl;
    return -1;
  }
  if (version != 0) {
    std::cout << "Version number " << version << " not supported" << std::endl;
    return -1;
  }

  // 2nd field: LoD information. It is not used afterwards, so each level is
  // skipped instead of being copied into a buffer sized from untrusted data.
  uint64_t lod_level = 0;
  if (!read_exact(is, &lod_level, sizeof(lod_level))) {
    std::cout << "Cannot read LoD level" << std::endl;
    return -1;
  }
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = 0;
    if (!read_exact(is, &size, sizeof(size))) {
      std::cout << "Cannot read LoD size" << std::endl;
      return -1;
    }
    is.ignore(static_cast<std::streamsize>(size));
    if (static_cast<uint64_t>(is.gcount()) != size) {
      std::cout << "LoD information truncated" << std::endl;
      return -1;
    }
  }

  // 3rd field: the tensor itself, starting with a duplicate version field.
  if (!read_exact(is, &version, sizeof(version))) {
    std::cout << "Cannot read tensor version" << std::endl;
    return -1;
  }
  if (version != 0) {
    std::cout << "Version number " << version << " not supported" << std::endl;
    return -1;
  }

  // int32_t descriptor size followed by the serialized TensorDesc.
  VarType::TensorDesc desc;
  int32_t size = 0;
  if (!read_exact(is, &size, sizeof(size)) || size < 0) {
    std::cout << "Cannot read tensor desc size" << std::endl;
    return -1;
  }
  std::vector<char> desc_buf(static_cast<size_t>(size));
  if (!read_exact(is, desc_buf.data(), desc_buf.size()) ||
      !desc.ParseFromArray(desc_buf.data(), size)) {
    std::cout << "Cannot parse tensor desc" << std::endl;
    return -1;
  }

  const std::vector<int64_t> dims(desc.dims().begin(), desc.dims().end());
  std::cout << "Dims:";
  for (auto x : dims) {
    std::cout << " " << x;
  }
  std::cout << std::endl;

  if (dims.size() != 2) {
    std::cout << "Parameter dims not 2D" << std::endl;
    return -1;
  }
  if (dims[0] < 0 || dims[1] < 0) {
    std::cout << "Parameter dims must be non-negative" << std::endl;
    return -1;
  }

  // Look the element width up without inserting a zero entry for unknown
  // types, which would silently produce empty values.
  const auto width_it = var_type_size.find(static_cast<int>(desc.data_type()));
  if (width_it == var_type_size.end()) {
    std::cout << "Unsupported data type " << desc.data_type() << std::endl;
    return -1;
  }
  const size_t elem_size = width_it->second;

  const size_t rows = static_cast<size_t>(dims[0]);
  const size_t cols = static_cast<size_t>(dims[1]);
  const size_t max_size = static_cast<size_t>(-1);
  if (cols != 0 && elem_size > max_size / cols) {
    std::cout << "Parameter too large" << std::endl;
    return -1;
  }
  const size_t row_bytes = elem_size * cols;
  if (row_bytes != 0 && rows > max_size / row_bytes) {
    std::cout << "Parameter too large" << std::endl;
    return -1;
  }

  // Read the whole tensor before creating the output, so a truncated input
  // does not leave a partially written sequence file behind.
  std::vector<char> tensor_buf(row_bytes * rows);
  if (!read_exact(is, tensor_buf.data(), tensor_buf.size())) {
    std::cout << "Tensor data truncated" << std::endl;
    return -1;
  }
  is.close();

  SeqFileWriter seq_file_writer(output_file);
  for (int64_t i = 0; i < dims[0]; ++i) {
    const char *row = tensor_buf.data() + static_cast<size_t>(i) * row_bytes;
    if (seq_file_writer.write(
            reinterpret_cast<const char *>(&i), sizeof(i), row, row_bytes) !=
        0) {
      std::cout << "Failed to write record " << i << std::endl;
      return -1;
    }
  }
  return 0;
}
// Entry point: seq_generator PARAMETER_FILE OUTPUT_FILE.
// Returns -1 on bad usage; otherwise returns the result of dump_parameter so
// that a failed conversion is visible in the process exit status.
int main(int argc, char **argv) {
  if (argc != 3) {
    std::cout << "Usage: seq_generator PARAMETER_FILE OUTPUT_FILE" << std::endl;
    return -1;
  }
  reg_var_types();
  return dump_parameter(argv[1], argv[2]);
}
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
......@@ -15,8 +15,8 @@
#pragma once
#include <gtest/gtest.h>
#include "core/predictor/framework/channel.h"
#include "core/predictor/op/op.h"
#include "core/predictor/msg_data.pb.h"
#include "core/predictor/op/op.h"
namespace baidu {
namespace paddle_serving {
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "core/predictor/unittest/test_server_manager.h" // TestServerManager
#include <gflags/gflags.h> // FLAGS
#include <gflags/gflags.h> // FLAGS
#include <string>
#include "core/predictor/framework/server.h" // ServerManager
......
......@@ -53,9 +53,9 @@
#include "json2pb/json_to_pb.h"
#endif
#include "core/configure/general_model_config.pb.h"
#include "core/configure/include/configure_parser.h"
#include "core/configure/sdk_configure.pb.h"
#include "core/configure/general_model_config.pb.h"
#include "core/sdk-cpp/include/utils.h"
......
......@@ -32,9 +32,9 @@ class EndpointConfigManager {
EndpointConfigManager()
: _last_update_timestamp(0), _current_endpointmap_id(1) {}
int create(const std::string & sdk_desc_str);
int create(const std::string& sdk_desc_str);
int load(const std::string & sdk_desc_str);
int load(const std::string& sdk_desc_str);
int create(const char* path, const char* file);
......
......@@ -16,9 +16,9 @@
#include <map>
#include <string>
#include <utility>
#include "glog/raw_logging.h"
#include "core/sdk-cpp/include/common.h"
#include "core/sdk-cpp/include/stub_impl.h"
#include "glog/raw_logging.h"
namespace baidu {
namespace paddle_serving {
......
......@@ -31,7 +31,7 @@ class PredictorApi {
int register_all();
int create(const std::string & sdk_desc_str);
int create(const std::string& sdk_desc_str);
int create(const char* path, const char* file);
......
......@@ -20,20 +20,17 @@ package baidu.paddle_serving.predictor.general_model;
option cc_generic_services = true;
// Tensor is the element type of FeedInst.tensor_array and
// FetchInst.tensor_array.
// The superseded field set (float_data = 3, elem_type = 4, shape = 5) was
// removed: it redeclared names and numbers used below, which protoc rejects.
// NOTE(review): float_data, elem_type and shape changed field numbers, so
// peers built against the previous layout are not wire-compatible.
message Tensor {
  repeated bytes data = 1;        // most general format
  repeated int32 int_data = 2;    // for simple debug only
  repeated int64 int64_data = 3;
  repeated float float_data = 4;  // for simple debug only
  // Element type code. NOTE(review): previously documented as
  // "support int64, float32" - confirm the accepted values.
  optional int32 elem_type = 5;
  repeated int32 shape = 6;
};
// One feed instance: a list of tensors. The duplicate definition of this
// message was removed, since a message may only be declared once per scope.
message FeedInst { repeated Tensor tensor_array = 1; };
// One fetch instance: a list of tensors. The duplicate definition of this
// message was removed, since a message may only be declared once per scope.
message FetchInst { repeated Tensor tensor_array = 1; };
message Request {
repeated FeedInst insts = 1;
......
......@@ -35,8 +35,7 @@ int Endpoint::initialize(const EndpointInfo& ep_info) {
return -1;
}
_variant_list.push_back(var);
VLOG(2) << "Succ create variant: " << vi
<< ", endpoint:" << _endpoint_name;
VLOG(2) << "Succ create variant: " << vi << ", endpoint:" << _endpoint_name;
}
return 0;
......@@ -76,7 +75,7 @@ int Endpoint::thrd_finalize() {
return -1;
}
}
LOG(INFO) << "Succ thrd finalize all vars: " << var_size;
VLOG(2) << "Succ thrd finalize all vars: " << var_size;
return 0;
}
......
......@@ -30,7 +30,7 @@ int PredictorApi::register_all() {
return 0;
}
int PredictorApi::create(const std::string & api_desc_str) {
int PredictorApi::create(const std::string& api_desc_str) {
VLOG(2) << api_desc_str;
if (register_all() != 0) {
LOG(ERROR) << "Failed do register all!";
......
......@@ -54,7 +54,7 @@ int Variant::initialize(const EndpointInfo& ep_info,
if (_stub_map.size() > 0) {
VLOG(2) << "Initialize variants from VariantInfo"
<< ", stubs count: " << _stub_map.size();
<< ", stubs count: " << _stub_map.size();
return 0;
}
......
# Pull in src/CMakeLists.txt first, then build the `utils` library from
# util_srcs (the included file is expected to have populated that list).
include(src/CMakeLists.txt)
add_library(utils ${util_srcs})
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <stdlib.h>
namespace baidu {
namespace paddle_serving {
......
# Collect every .cc file that sits next to this list file and append the
# result to util_srcs.
file(GLOB srcs
  ${CMAKE_CURRENT_LIST_DIR}/*.cc)
list(APPEND util_srcs
  ${srcs})
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include "core/util/include/timer.h"
#include <sys/time.h>
namespace baidu {
namespace paddle_serving {
......@@ -56,7 +56,7 @@ double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
// Returns the current wall-clock time in microseconds, built from both the
// seconds and microseconds parts that gettimeofday() stores in _now.
// The seconds part is widened to int64_t before scaling so the product
// cannot overflow where `long` is 32 bits.
int64_t Timer::TimeStampUS() {
  gettimeofday(&_now, NULL);
  return static_cast<int64_t>(_now.tv_sec) * 1000 * 1000 + _now.tv_usec;
}
int64_t Timer::Tickus() {
......
# 如何编译PaddleServing
### 编译环境设置
- os: CentOS 6u3
- gcc: 4.8.2及以上
- go: 1.9.2及以上
- git:2.17.1及以上
- cmake:3.2.2及以上
- python:2.7.2及以上
### 获取代码
``` shell
git clone https://github.com/PaddlePaddle/Serving
git submodule update --init --recursive
cd Serving && git submodule update --init --recursive
```
### 编译Server部分
#### PYTHONROOT设置
``` shell
# 例如python的路径为/usr/bin/python,可以设置PYTHONROOT
export PYTHONROOT=/usr/
```
#### 集成CPU版本Paddle Inference Library
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=/home/users/dongdaxiang/software/baidu/third-party/python/bin/python -DCLIENT_ONLY=OFF ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=OFF ..
make -j10
```
#### 集成GPU版本Paddle Inference Library
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=/home/users/dongdaxiang/software/baidu/third-party/python/bin/python -DCLIENT_ONLY=ON -DWITH_GPU=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=OFF -DWITH_GPU=ON ..
make -j10
```
### 编译Client部分
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=/home/users/dongdaxiang/software/baidu/third-party/python/bin/python -DCLIENT_ONLY=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=ON ..
make -j10
```
### 安装wheel包
无论是client端还是server端,编译完成后,安装python/dist/下的whl包即可
### 注意事项
运行python端server时,会检查`SERVING_BIN`环境变量。如果想使用自己编译的二进制文件,请将该环境变量设置为对应二进制文件的路径,通常是`export SERVING_BIN=${BUILD_PATH}/core/general-server/serving`。
......@@ -38,20 +38,27 @@ Paddle Serving uses this [Git branching model](http://nvie.com/posts/a-successfu
pre-commit install
```
Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python.
Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python. In addition, cpplint and pylint are required to check the code style of C/C++ and Python respectively. You may need to install cpplint and pylint by running the following commands:
```bash
pip install cpplint pylint
```
Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`:
```shell
$ git commit
CRLF end-lines remover...............................(no files to check)Skipped
yapf.................................................(no files to check)Skipped
yapf.....................................................................Passed
Check for added large files..............................................Passed
Check for merge conflicts................................................Passed
Check for broken symlinks................................................Passed
Detect Private Key...................................(no files to check)Skipped
Fix End of Files.....................................(no files to check)Skipped
clang-formater.......................................(no files to check)Skipped
Fix End of Files.........................................................Passed
clang-format.............................................................Passed
cpplint..................................................................Passed
pylint...................................................................Passed
copyright_checker........................................................Passed
[my-cool-stuff c703c041] add test file
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 233
......@@ -149,7 +156,6 @@ GLOG_minloglevel=1 bin/serving
1 - WARNING
2 -ERROR
2 - ERROR
3 - FATAL (Be careful as FATAL log will generate a coredump)
# Paddle Serving设计文档
## 1. 整体设计目标
- 长期使命:Paddle Serving是一个PaddlePaddle开源的在线服务框架,长期目标就是围绕着人工智能落地的最后一公里提供越来越专业、可靠、易用的服务。
- 工业级:为了达到工业级深度学习模型在线部署的要求,
Paddle Serving提供很多大规模场景需要的部署功能:1)分布式稀疏参数索引功能;2)高并发底层通信能力;3)模型管理、在线A/B流量测试、模型热加载。
- 简单易用:为了让使用Paddle的用户能够以极低的成本部署模型,PaddleServing设计了一套与Paddle训练框架无缝打通的预测部署API,普通模型可以使用一行命令进行服务部署。
- 功能扩展:当前,Paddle Serving支持C++、Python、Golang的客户端,未来也会面向不同类型的客户新增多种语言的客户端。在Paddle Serving的框架设计方面,尽管当前Paddle Serving以支持Paddle模型的部署为核心功能,
用户可以很容易嵌入其他的机器学习库部署在线预测。
## 2. 模块设计与实现
### 2.1 Python API接口设计
#### 2.1.1 训练模型的保存
Paddle的模型预测需要重点关注的内容:1)模型的输入变量;2)模型的输出变量;3)模型结构和模型参数。Paddle Serving Python API提供用户可以在训练过程中保存模型的接口,并将Paddle Serving在部署阶段需要保存的配置打包保存,一个示例如下:
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("serving_model", "client_conf",
{"words": data}, {"prediction": prediction},
fluid.default_main_program())
```
代码示例中,`{"words": data}`和`{"prediction": prediction}`分别指定了模型的输入和输出,`"words"`和`"prediction"`是输入和输出变量的别名,设计别名的目的是为了使开发者能够记忆自己训练模型的输入输出对应的字段。`data`和`prediction`则是Paddle训练过程中的[Variable](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Variable_cn.html#variable),通常代表张量([Tensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Tensor_cn.html#tensor))或变长张量([LodTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor))。调用保存命令后,会按照用户指定的`"serving_model"`和`"client_conf"`生成两个目录,内容如下:
``` shell
.
├── client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
└── serving_model
├── embedding_0.w_0
├── fc_0.b_0
├── fc_0.w_0
├── fc_1.b_0
├── fc_1.w_0
├── fc_2.b_0
├── fc_2.w_0
├── lstm_0.b_0
├── lstm_0.w_0
├── __model__
├── serving_server_conf.prototxt
└── serving_server_conf.stream.prototxt
```
其中,`"serving_client_conf.prototxt"`和`"serving_server_conf.prototxt"`是Paddle Serving的Client和Server端需要加载的配置,`"serving_client_conf.stream.prototxt"`和`"serving_server_conf.stream.prototxt"`是配置文件的二进制形式。`"serving_model"`下保存的其他内容和Paddle保存的模型文件是一致的。我们会考虑未来在Paddle框架中直接保存可服务的配置,实现配置保存对用户无感。
#### 2.1.2 服务端模型加载
服务端的预测逻辑可以通过Paddle Serving Server端的API进行人工定义,一个例子:
``` python
import paddle_serving_server as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
dist_kv_op = op_maker.create('general_dist_kv')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
```
当前Paddle Serving在Server端支持的主要Op请参考如下列表:
<center>
| Op 名称 | 描述 |
|--------------|------|
| `general_reader` | 通用数据格式的读取Op |
| `general_infer` | 通用数据格式的Paddle预测Op |
| `general_response` | 通用数据格式的响应Op |
| `general_dist_kv` | 分布式索引Op |
</center>
当前Paddle Serving中的预估引擎支持在CPU/GPU上进行预测,对应的预测服务安装包以及镜像也有两个。但无论是CPU上进行模型预估还是GPU上进行模型预估,普通模型的预测都可用一行命令进行启动。
``` shell
python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292
```
``` shell
python -m paddle_serving_server_gpu.serve --model your_servable_model --thread 10 --port 9292
```
启动命令的选项列表如下:
<center>
| 参数 | 类型 | 默认值 | 描述 |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | 服务端的并发数,通常与CPU核数一致即可 |
| `port` | int | `9292` | 服务暴露给用户的端口 |
| `name` | str | `""` | 服务名称,当用户指定时代表直接启动的是HTTP服务 |
| `model` | str | `""` | 服务端模型文件夹路径 |
| `gpu_ids` | str | `""` | 仅在paddle_serving_server_gpu中可以使用,功能与CUDA_VISIBLE_DEVICES一致 |
</center>
例如,`python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292`对应的Server端具体配置如下:
``` python
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.load_model_config(your_servable_model)
server.prepare_server(port=9292, device="cpu")
server.run_server()
```
#### 2.1.3 客户端访问API
Paddle Serving支持远程服务访问的协议一种是基于RPC,另一种是HTTP。用户通过RPC访问,可以使用Paddle Serving提供的Python Client API,通过定制输入数据的格式来实现服务访问。下面的例子解释Paddle Serving Client如何定义输入数据。保存可部署模型时需要指定每个输入的别名,例如`sparse`和`dense`,对应的数据可以是离散的ID序列`[1, 1001, 100001]`,也可以是稠密的向量`[0.2, 0.5, 0.1, 0.4, 0.11, 0.22]`。当前Client的设计,对于离散的ID序列,支持Paddle中的`lod_level=0`和`lod_level=1`的情况,即张量以及一维变长张量。对于稠密的向量,支持`N-D Tensor`。用户不需要显式指定输入数据的形状,Paddle Serving的Client API会通过保存配置时记录的输入形状进行对应的检查。
``` python
feed_dict["sparse"] = [1, 1001, 100001]
feed_dict["dense"] = [0.2, 0.5, 0.1, 0.4, 0.11, 0.22]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
```
Client链接Server的代码,通常只需要加载保存模型时保存的Client端配置,以及指定要去访问的服务端点即可。为了保持内部访问进行数据并行的扩展能力,Paddle Serving Client允许定义多个服务端点。
``` python
client = Client()
client.load_client_config('servable_client_configs')
client.connect(["127.0.0.1:9292"])
```
### 2.2 底层通信机制
Paddle Serving采用[baidu-rpc](https://github.com/apache/incubator-brpc)进行底层的通信。baidu-rpc是百度开源的一款RPC通信库,具有高并发、低延时等特点,已经支持了包括百度在内上百万在线预估实例、上千个在线预估服务,稳定可靠。
### 2.3 核心执行引擎
Paddle Serving的核心执行引擎是一个有向无环图,图中的每个节点代表预估服务的一个环节,例如计算模型预测打分就是其中一个环节。有向无环图有利于可并发节点充分利用部署实例内的计算资源,缩短延时。一个例子,当同一份输入需要送入两个不同的模型进行预估,并将两个模型预估的打分进行加权求和时,两个模型的打分过程即可以通过有向无环图的拓扑关系并发。
<p align="center">
<br>
<img src='design_doc.png'>
<br>
</p>
### 2.4 微服务插件模式
由于Paddle Serving底层采用基于C++的通信组件,并且核心框架也是基于C/C++编写,当用户想要在服务端定义复杂的前处理与后处理逻辑时,一种办法是修改Paddle Serving底层框架,重新编译源码。另一种方式可以通过在服务端嵌入轻量级的Web服务,通过在Web服务中实现更复杂的预处理逻辑,从而搭建一套逻辑完整的服务。当访问量超过了Web服务能够接受的范围,开发者有足够的理由开发一些高性能的C++预处理逻辑,并嵌入到Serving的原生服务库中。Web服务和RPC服务的关系以及他们的组合方式可以参考下文`用户类型`中的说明。
## 3. 工业级特性
### 3.1 分布式稀疏参数索引
分布式稀疏参数索引通常在广告推荐中出现,并与分布式训练配合形成完整的离线-在线一体化部署。下图解释了其中的流程,产品的在线服务接受用户请求后将请求发送给预估服务,同时系统会记录用户的请求以进行相应的训练日志处理和拼接。离线分布式训练系统会针对流式产出的训练日志进行模型增量训练,而增量产生的模型会配送至分布式稀疏参数索引服务,同时对应的稠密的模型参数也会配送至在线的预估服务。在线服务由两部分组成,一部分是针对用户的请求提取特征后,将需要进行模型的稀疏参数索引的特征发送请求给分布式稀疏参数索引服务,针对分布式稀疏参数索引服务返回的稀疏参数再进行后续深度学习模型的计算流程,从而完成预估。
<p align="center">
<br>
<img src='cube_eng.png' width = "450" height = "230">
<br>
</p>
为什么要使用Paddle Serving提供的分布式稀疏参数索引服务?1)在一些推荐场景中,模型的输入特征规模通常可以达到上千亿,单台机器无法支撑T级别模型在内存的保存,因此需要进行分布式存储。2)Paddle Serving提供的分布式稀疏参数索引服务,具有并发请求多个节点的能力,从而以较低的延时完成预估服务。
### 3.2 模型管理、在线A/B流量测试、模型热加载
Paddle Serving的C++引擎支持模型管理、在线A/B流量测试、模型热加载等功能,当前Python API还没有完全开放这部分功能的配置,敬请期待。
## 4. 用户类型
Paddle Serving面向的用户提供RPC和HTTP两种访问协议。对于HTTP协议,我们更倾向于流量中小型的服务使用,并且对延时没有严格要求的AI服务开发者。对于RPC协议,我们面向流量较大,对延时要求更高的用户,此外RPC的客户端可能也处在一个大系统的服务中,这种情况下非常适合使用Paddle Serving提供的RPC服务。对于使用分布式稀疏参数索引服务而言,Paddle Serving的用户不需要关心底层的细节,其调用本质也是通过RPC服务再调用RPC服务。下图给出了当前设计的Paddle Serving可能会使用Serving服务的几种场景。
<p align="center">
<br>
<img src='user_groups.png' width = "700" height = "470">
<br>
</p>
对于普通的模型而言(具体指通过Serving提供的IO保存的模型,并且没有对模型进行后处理),用户使用RPC服务不需要额外的开发即可实现服务启动,但需要开发一些Client端的代码来使用服务。对于Web服务的开发,需要用户先在Paddle Serving提供的Web Service框架中进行前后处理的开发,从而实现整个HTTP服务。
### 4.1 Web服务开发
Web服务有很多开源的框架,Paddle Serving当前集成了Flask框架,但这部分对用户不可见,在未来可能会提供性能更好的Web框架作为底层HTTP服务集成引擎。用户需要继承WebService,从而实现对rpc服务的输入输出进行加工的目的。
``` python
from paddle_serving_server.web_service import WebService
from imdb_reader import IMDBDataset
import sys
class IMDBService(WebService):
def prepare_dict(self, args={}):
if len(args) == 0:
exit(-1)
self.dataset = IMDBDataset()
self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]):
if "words" not in feed:
exit(-1)
res_feed = {}
res_feed["words"] = self.dataset.get_words_only(feed["words"])[0]
return res_feed, fetch
imdb_service = IMDBService(name="imdb")
imdb_service.load_model_config(sys.argv[1])
imdb_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu")
imdb_service.prepare_dict({"dict_file_path": sys.argv[4]})
imdb_service.run_server()
```
`WebService`作为基类,提供将用户接受的HTTP请求转化为RPC输入的接口`preprocess`,同时提供对RPC请求返回的结果进行后处理的接口`postprocess`,继承`WebService`的子类,可以定义各种类型的成员函数。`WebService`的启动命令和普通RPC服务提供的启动API一致。
## 5. 未来计划
### 5.1 有向无环图结构定义开放
当前版本开放的python API仅支持用户定义Sequential类型的执行流,如果想要进行Server进程内复杂的计算,需要增加对应的用户API。
### 5.2 云端自动部署能力
为了方便用户更容易将Paddle的预测模型部署到线上,Paddle Serving在接下来的版本会提供Kubernetes生态下任务编排的工具。
### 5.3 向量检索、树结构检索
在推荐与广告场景的召回系统中,通常需要采用基于向量的快速检索或者基于树结构的快速检索,Paddle Serving会对这方面的检索引擎进行集成或扩展。
# Paddle Serving Design Doc
## 1. Design Objectives
- Long Term Vision: Online deployment of deep learning models will be a user-facing application in the future. Any AI developer will face the problem of deploying an online service for his or her trained model.
Paddle Serving is the official open source online deployment framework. The long term goal of Paddle Serving is to provide professional, reliable and easy-to-use online service to the last mile of AI application.
- Easy-To-Use: For algorithmic developers to quickly deploy their models online, Paddle Serving designs APIs that can be used with Paddle's training process seamlessly, most Paddle models can be deployed as a service with one line command.
- Industrial Oriented: To meet industrial deployment requirements, Paddle Serving supports lots of large-scale deployment functions: 1) Distributed Sparse Embedding Indexing. 2) Highly concurrent underlying communications. 3) Model Management, online A/B test, model online loading.
- Extensibility: Paddle Serving supports C++, Python and Golang clients, and will support more clients in different languages. It is very easy to extend Paddle Serving to support other machine learning inference libraries, although currently the Paddle inference library is the only officially supported inference backend.
## 2. Module design and implementation
### 2.1 Python API interface design
#### 2.1.1 save a servable model
The inference phase of Paddle model focuses on 1) input variables of the model. 2) output variables of the model. 3) model structure and model parameters. Paddle Serving Python API provides a `save_model` interface for trained model, and save necessary information for Paddle Serving to use during deployment phase. An example is as follows:
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("serving_model", "client_conf",
{"words": data}, {"prediction": prediction},
fluid.default_main_program())
```
In the example, `{"words": data}` and `{"prediction": prediction}` assign the inputs and outputs of a model. `"words"` and `"prediction"` are alias names of inputs and outputs. The design of alias name is to help developers to memorize model inputs and model outputs. `data` and `prediction` are Paddle [Variable](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Variable_cn.html#variable) objects in the training phase, which usually represent a [Tensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Tensor_cn.html#tensor) or a [LodTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor). When the `save_model` API is called, two directories called `"serving_model"` and `"client_conf"` will be generated. The content of the saved model is as follows:
``` shell
.
├── client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
└── serving_model
├── embedding_0.w_0
├── fc_0.b_0
├── fc_0.w_0
├── fc_1.b_0
├── fc_1.w_0
├── fc_2.b_0
├── fc_2.w_0
├── lstm_0.b_0
├── lstm_0.w_0
├── __model__
├── serving_server_conf.prototxt
└── serving_server_conf.stream.prototxt
```
`"serving_client_conf.prototxt"` and `"serving_server_conf.prototxt"` are the client side and the server side configurations of Paddle Serving, and `"serving_client_conf.stream.prototxt"` and `"serving_server_conf.stream.prototxt"` are the binary forms of the corresponding configuration files. Other contents saved in the directory are the same as Paddle saved inference model. We are considering to support `save_model` interface in Paddle training framework so that a user is not aware of the servable configurations.
#### 2.1.2 Model loading on the server side
Prediction logics on the server side can be defined through Paddle Serving Server API with a few lines of code, an example is as follows:
``` python
import paddle_serving_server as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
dist_kv_op = op_maker.create('general_dist_kv')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
```
Current Paddle Serving supports operator list on the server side as follows:
<center>
| Op Name | Description |
|--------------|------|
| `general_reader` | General Data Reading Operator |
| `general_infer` | General Data Inference with Paddle Operator |
| `general_response` | General Data Response Operator |
| `general_dist_kv` | Distributed Sparse Embedding Indexing |
</center>
Paddle Serving supports inference engine on multiple devices. Current supports are CPU and GPU engine. Docker Images of CPU and GPU are provided officially. User can use one line command to start an inference service either on CPU or on GPU.
``` shell
python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292
```
``` shell
python -m paddle_serving_server_gpu.serve --model your_servable_model --thread 10 --port 9292
```
Options of startup command are listed below:
<center>
| Arguments | Types | Defaults | Descriptions |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | Concurrency on server side, usually equal to the number of CPU core |
| `port` | int | `9292` | Port exposed to users |
| `name` | str | `""` | Service name that if a user specifies, the name of HTTP service is allocated |
| `model` | str | `""` | Servable models for Paddle Serving |
| `gpu_ids` | str | `""` | Supported only in paddle_serving_server_gpu, similar to the usage of CUDA_VISIBLE_DEVICES |
</center>
For example, `python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292` is the same as the following code as user can define:
``` python
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.load_model_config(your_servable_model)
server.prepare_server(port=9292, device="cpu")
server.run_server()
```
#### 2.1.3 Paddle Serving Client API
Paddle Serving supports remote service access through RPC(remote procedure call) and HTTP. RPC access of remote service can be called through Client API of Paddle Serving. A user can define data preprocess function before calling Paddle Serving's client API. The example below explains how to define the input data of Paddle Serving Client. The servable model has two inputs with alias name of `sparse` and `dense`. `sparse` corresponds to sparse sequence ids such as `[1, 1001, 100001]` and `dense` corresponds to dense vector such as `[0.2, 0.5, 0.1, 0.4, 0.11, 0.22]`. For sparse sequence data, current design supports `lod_level=0` and `lod_level=1` of Paddle, that corresponds to `Tensor` and `LodTensor`. For dense vector, current design supports any `N-D Tensor`. Users do not need to assign the shape of inference model input. The Paddle Serving Client API will check the input data's shape with servable configurations.
``` python
feed_dict["sparse"] = [1, 1001, 100001]
feed_dict["dense"] = [0.2, 0.5, 0.1, 0.4, 0.11, 0.22]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
```
The following code sample shows that Paddle Serving Client API connects to Server API with endpoint of the servers. To use the data parallelism ability during prediction, Paddle Serving Client allows users to define multiple server endpoints.
``` python
client = Client()
client.load_client_config('servable_client_configs')
client.connect(["127.0.0.1:9292"])
```
### 2.2 Underlying Communication Mechanism
Paddle Serving adopts [baidu-rpc](https://github.com/apache/incubator-brpc) as underlying communication layer. baidu-rpc is an open-source RPC communication library with high concurrency and low latency advantages compared with other open source RPC library. Millions of instances and thousands of services are using baidu-rpc within Baidu.
### 2.3 Core Execution Engine
The core execution engine of Paddle Serving is a directed acyclic graph (DAG). In the DAG, each node represents a phase of inference service, such as paddle inference prediction, data preprocessing and data postprocessing. A DAG lets independent nodes run concurrently, so the computation resources of a serving instance are fully utilized and latency is reduced. For example, when a user has input data that needs to be fed into two models, and the scores of the two models are then combined, the two model-scoring computations can run in parallel through the DAG.
<p align="center">
<br>
<img src='design_doc.png'>
<br>
</p>
### 2.4 Micro service plugin
The underlying communication of Paddle Serving is implemented with C++ as well as the core framework, so it is hard for users who are not familiar with C++ to implement new Paddle Serving Server Operators. Another approach is to use the light-weighted Web Service in Paddle Serving Server that can be viewed as a plugin. A user can implement complex data preprocessing and postprocessing logics to build a complex AI service. If access of the AI service has a large volume, it is worth implementing the service with high performance Paddle Serving Server operators. The relationship between Web Service and RPC Service can be referenced in `User Types`.
## 3. Industrial Features
### 3.1 Distributed Sparse Parameter Indexing
Distributed Sparse Parameter Indexing is commonly seen in advertising and recommendation scenarios, and is often used coupled with distributed training. The figure below explains a commonly seen architecture for online recommendation. When the recommendation service receives a request from a user, the system will automatically collect training logs for offline distributed training. Meanwhile, the request is sent to Paddle Serving Server. For sparse features, distributed sparse parameter index service is called so that sparse parameters can be looked up. The dense input features together with the looked up sparse model parameters are fed into the Paddle Inference Node of the DAG in Paddle Serving Server. Then the score is returned through RPC to the product service for item ranking.
<p align="center">
<br>
<img src='cube_eng.png' width = "450" height = "230">
<br>
</p>
Why do we need to support distributed sparse parameter indexing in Paddle Serving? 1) In some recommendation scenarios, the number of features can be up to hundreds of billions that a single node can not hold the parameters within random access memory. 2) Paddle Serving supports distributed sparse parameter indexing that can couple with paddle inference. Users do not need to do extra work to have a low latency inference engine with hundreds of billions of parameters.
### 3.2 Model Management, online A/B test, Model Online Reloading
Paddle Serving's C++ engine supports model management, online A/B test and model online reloading. Currently, python API is not released yet, please wait for the next release.
## 4. User Types
Paddle Serving provides RPC and HTTP protocols for users. The HTTP service is recommended for services with medium or small traffic where latency is not a strict requirement. The RPC protocol is recommended for high-traffic services and services with low-latency requirements. For users who use distributed sparse parameter indexing built-in service, it is not necessary to care about the underlying details of communication. The following figure shows several scenarios in which users may want to use Paddle Serving.
<p align="center">
<br>
<img src='user_groups.png' width = "700" height = "470">
<br>
</p>
For servable models saved from Paddle Serving IO API, users do not need to do extra coding work to startup a service, but may need some coding work on the client side. For development of Web Service plugin, a user needs to provide implementation of Web Service's preprocessing and postprocessing work if needed to get a HTTP service.
### 4.1 Web Service Development
There are many open-source Web Service frameworks. Currently Paddle Serving uses Flask as the built-in service framework, and users are not aware of this. A more efficient web service framework will be integrated in the future if needed.
``` python
from paddle_serving_server.web_service import WebService
from imdb_reader import IMDBDataset
import sys
class IMDBService(WebService):
def prepare_dict(self, args={}):
if len(args) == 0:
exit(-1)
self.dataset = IMDBDataset()
self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]):
if "words" not in feed:
exit(-1)
res_feed = {}
res_feed["words"] = self.dataset.get_words_only(feed["words"])[0]
return res_feed, fetch
imdb_service = IMDBService(name="imdb")
imdb_service.load_model_config(sys.argv[1])
imdb_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu")
imdb_service.prepare_dict({"dict_file_path": sys.argv[4]})
imdb_service.run_server()
```
`WebService` is a Base Class, providing inheritable interfaces such as `preprocess` and `postprocess` for users to implement. In a class inherited from `WebService`, users can define any functions they want, and the startup interface is the same as that of the RPC service.
## 5. Future Plan
### 5.1 Open DAG definition API
Current version of Paddle Serving Server supports sequential type of execution flow. DAG definition API can be more helpful to users on complex tasks.
### 5.2 Auto Deployment on Cloud
In order to make deployment on public cloud easier, Paddle Serving plans to provide Kubernetes Operators for submitting service jobs.
### 5.3 Vector Indexing and Tree based Indexing
In recommendation and advertisement systems, it is commonly seen to use vector based index or tree based indexing service to do candidate retrievals. These retrieval tasks will be built-in services of Paddle Serving.
......@@ -193,6 +193,3 @@ total num: 25000
acc num: 22014
acc: 0.88056
```
......@@ -143,6 +143,3 @@ self.op_dict = {
"general_dist_kv": "GeneralDistKVOp"
}
```
......@@ -9,6 +9,9 @@ Paddle Serving是PaddlePaddle的在线预估服务框架,能够帮助开发者
Paddle Serving当前的develop版本支持轻量级Python API进行快速预测,并且与Paddle的训练可以打通。我们以最经典的波士顿房价预测为示例,完整说明在单机进行模型训练以及使用Paddle Serving进行模型部署的过程。
#### 安装
强烈建议您在Docker内构建Paddle Serving,请查看[如何在Docker中运行PaddleServing](doc/RUN_IN_DOCKER_CN.md)
```
pip install paddle-serving-client
pip install paddle-serving-server
......
# How to run PaddleServing in Docker
## Requirements
Docker (GPU version requires nvidia-docker to be installed on the GPU machine)
## CPU
### Get docker image
You can get images in two ways:
1. Pull image directly
```bash
docker pull hub.baidubce.com/ctr/paddleserving:0.1.3
```
2. Building image based on dockerfile
Create a new folder and copy [Dockerfile](../tools/Dockerfile) to this folder, and run the following command:
```bash
docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3 .
```
### Create container
```bash
docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3
docker exec -it test bash
```
The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
```bash
pip install paddle-serving-server
```
### Test example
Get the trained Boston house price prediction model by the following command:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- Test HTTP service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.web_serve --model uci_housing_model --thread 10 --port 9292 --name uci &>std.log 2>err.log &
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 &>std.log 2>err.log &
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## GPU
The GPU version is basically the same as the CPU version, with only some differences in interface naming (GPU version requires nvidia-docker to be installed on the GPU machine).
### Get docker image
You can also get images in two ways:
1. Pull image directly
```bash
nvidia-docker pull hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
```
2. Building image based on dockerfile
Create a new folder and copy [Dockerfile.gpu](../tools/Dockerfile.gpu) to this folder, and run the following command:
```bash
nvidia-docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3-gpu .
```
### Create container
```bash
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker exec -it test bash
```
The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
```bash
pip install paddle-serving-server-gpu
```
### Test example
Get the trained Boston house price prediction model by the following command:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- Test HTTP service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.web_serve --model uci_housing_model --thread 10 --port 9292 --name uci
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
# 如何在Docker中运行PaddleServing
## 环境要求
Docker(GPU版本需要在GPU机器上安装nvidia-docker)
## CPU版本
### 获取镜像
可以通过两种方式获取镜像。
1. 直接拉取镜像
```bash
docker pull hub.baidubce.com/ctr/paddleserving:0.1.3
```
2. 基于Dockerfile构建镜像
建立新目录,复制[Dockerfile](../tools/Dockerfile)内容到该目录下Dockerfile文件。执行
```bash
docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3 .
```
### 创建容器并进入
```bash
docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3
docker exec -it test bash
```
`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装
```bash
pip install paddle-serving-server
```
### 测试example
通过下面命令获取训练好的Boston房价预估模型:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- 测试HTTP服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server.web_serve --model uci_housing_model --thread 10 --port 9292 --name uci &>std.log 2>err.log &
```
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 &>std.log 2>err.log &
```
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码:
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## GPU版本
GPU版本与CPU版本基本一致,只有部分接口命名的差别(GPU版本需要在GPU机器上安装nvidia-docker)。
### 获取镜像
可以通过两种方式获取镜像。
1. 直接拉取镜像
```bash
nvidia-docker pull hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
```
2. 基于Dockerfile构建镜像
建立新目录,复制[Dockerfile.gpu](../tools/Dockerfile.gpu)内容到该目录下Dockerfile文件。执行
```bash
nvidia-docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3-gpu .
```
### 创建容器并进入
```bash
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker exec -it test bash
```
`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装
```bash
pip install paddle-serving-server-gpu
```
### 测试example
通过下面命令获取训练好的Boston房价预估模型:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- 测试HTTP服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.web_serve --model uci_housing_model --thread 10 --port 9292 --name uci
```
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292
```
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码:
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## How to save a servable model of Paddle Serving?
- Currently, Paddle Serving provides a `save_model` interface for users to access; the interface is similar to `save_inference_model` of Paddle.
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("imdb_model", "imdb_client_conf",
{"words": data}, {"prediction": prediction},
fluid.default_main_program())
```
`imdb_model` is the server side model with serving configurations. `imdb_client_conf` contains the client RPC configurations. Serving has a
dictionary for `Feed` and `Fetch` variables for client to assign. In the example, `{"words": data}` is the feed dict that specifies the input of saved inference model. `{"prediction": prediction}` is the fetch dict that specifies the output of saved inference model. An alias name can be defined for feed and fetch variables. An example of how to use alias name
is as follows:
``` python
from paddle_serving_client import Client
import sys
client = Client()
client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9393"])
for line in sys.stdin:
group = line.strip().split()
words = [int(x) for x in group[1:int(group[0]) + 1]]
label = [int(group[-1])]
feed = {"words": words, "label": label}
fetch = ["acc", "cost", "prediction"]
fetch_map = client.predict(feed=feed, fetch=fetch)
print("{} {}".format(fetch_map["prediction"][1], label[0]))
```
......@@ -54,10 +54,3 @@ op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
```
......@@ -2,7 +2,7 @@ FILE(GLOB fluid_cpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_cpu_engine ${fluid_cpu_engine_srcs})
target_include_directories(fluid_cpu_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_cpu_engine pdserving extern_paddle configure kvdb)
add_dependencies(fluid_cpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_cpu_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_cpu_engine
......
......@@ -21,8 +21,8 @@
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "paddle_inference_api.h" // NOLINT
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
......@@ -336,7 +336,7 @@ class SigmoidModel {
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
......@@ -373,7 +373,7 @@ class SigmoidFluidModel {
clone_model.reset(new SigmoidFluidModel());
clone_model->_sigmoid_core = _sigmoid_core;
clone_model->_fluid_core = _fluid_core->Clone();
return std::move(clone_model);
return std::move(clone_model); // NOLINT
}
public:
......@@ -459,7 +459,7 @@ class FluidCpuWithSigmoidCore : public FluidFamilyCore {
}
protected:
std::unique_ptr<SigmoidFluidModel> _core;
std::unique_ptr<SigmoidFluidModel> _core; // NOLINT
};
class FluidCpuNativeDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
......
......@@ -2,7 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
target_include_directories(fluid_gpu_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_gpu_engine pdserving extern_paddle configure kvdb)
add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_gpu_engine
......
......@@ -23,8 +23,8 @@
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "paddle_inference_api.h" // NOLINT
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
DECLARE_int32(gpuid);
......@@ -334,13 +334,13 @@ class SigmoidModel {
return -1;
}
VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
<< _sigmoid_w._params[1] << "].";
<< _sigmoid_w._params[1] << "].";
if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
LOG(ERROR) << "load params sigmoid_b failed.";
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
......
if (CLIENT_ONLY)
if (CLIENT)
file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
set(PY_FILES ${SERVING_CLIENT_PY_FILES})
SET(PACKAGE_NAME "serving_client")
set(SETUP_LOG_FILE "setup.py.client.log")
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
if (NOT WITH_GPU)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
......@@ -16,12 +16,17 @@ if (NOT CLIENT_ONLY)
set(SETUP_LOG_FILE "setup.py.server.log")
endif()
if (CLIENT_ONLY)
if (CLIENT)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
if (NOT CLIENT_ONLY)
if (APP)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
if (SERVER)
if (NOT WITH_GPU)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
......@@ -31,21 +36,28 @@ if (NOT CLIENT_ONLY)
endif()
endif()
set(SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/serving_client.so)
set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so)
message("python env: " ${py_env})
if (CLIENT_ONLY)
# Wheel packaging rule for the paddle_serving_app Python package; only active
# when the APP option is enabled.
if (APP)
# Copy the paddle_serving_app sources into the build tree's python/ directory,
# then run `setup.py bdist_wheel` with the ${py_env} environment settings.
# NOTE(review): none of the commands listed here writes the declared OUTPUT
# (.timestamp), so this rule presumably re-runs on every build -- confirm that
# this is intended.
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel)
# Attach the rule above to the default (ALL) build through the paddle_python target.
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
if (CLIENT)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SERVING_BINARY_DIR}/core/general-client/serving_client.so ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/
COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
if(NOT WITH_GPU)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......@@ -67,20 +79,22 @@ endif()
set(SERVING_CLIENT_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
set(SERVING_SERVER_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (CLIENT_ONLY)
if (CLIENT)
install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_client/share/wheels
)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_server/share/wheels
)
endif()
if (CLIENT OR SERVER)
find_program(PATCHELF_EXECUTABLE patchelf)
if(NOT PATCHELF_EXECUTABLE)
if (NOT PATCHELF_EXECUTABLE)
message(FATAL_ERROR "patchelf not found, please install it.\n"
"For Ubuntu, the command is: apt-get install -y patchelf.")
endif()
endif()
## 语义理解预测服务
示例中采用BERT模型进行语义理解预测,将文本表示为向量的形式,可以用来做进一步的分析和预测。
### 获取模型
示例中采用[Paddlehub](https://github.com/PaddlePaddle/PaddleHub)中的[BERT中文模型](https://www.paddlepaddle.org.cn/hubdetail?name=bert_chinese_L-12_H-768_A-12&en_category=SemanticModel)
执行
```
python prepare_model.py
```
生成server端配置文件与模型文件,存放在serving_server_model文件夹
生成client端配置文件,存放在serving_client_conf文件夹
### 获取词典和样例数据
```
sh get_data.sh
```
脚本将下载中文词典vocab.txt和中文样例数据data-c.txt
### 启动RPC预测服务
执行
```
python -m paddle_serving_server.serve --model serving_server_model/ --port 9292 #启动cpu预测服务
```
或者
```
python -m paddle_serving_server_gpu.serve --model serving_server_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务
```
### 执行预测
```
python bert_rpc_client.py --thread 4
```
启动client读取data-c.txt中的数据进行预测,--thread参数控制client的进程数,预测结束后会打印出每个进程的耗时。server端的地址可以在脚本中修改。
### 启动HTTP预测服务
```
export CUDA_VISIBLE_DEVICES=0,1
```
通过环境变量指定gpu预测服务使用的gpu,示例中指定索引为0和1的两块gpu
```
python bert_web_service.py serving_server_model/ 9292 #启动gpu预测服务
```
### 执行预测
```
curl -H "Content-Type:application/json" -X POST -d '{"words": "hello", "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
### Benchmark
模型:bert_chinese_L-12_H-768_A-12
设备:GPU V100 * 1
环境:CUDA 9.2,cudnn 7.1.4
测试中将样例数据中的1万个样本复制为10万个样本,每个client线程发送总样本数的"线程数分之一"(例如4个线程时,每个线程发送2.5万个样本),batch size为1,max_seq_len为20,时间单位为秒。
在client线程数为4时,预测速度可以达到432样本每秒。
由于单张GPU内部只能串行计算,client线程增多只能减少GPU的空闲时间,因此在线程数达到4之后,线程数增多对预测速度没有提升。
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | total |
| ------------------ | ------ | ------------ | ----- | ------ | ---- | ------- | ------ |
| 1 | 3.05 | 290.54 | 0.37 | 239.15 | 6.43 | 0.71 | 365.63 |
| 4 | 0.85 | 213.66 | 0.091 | 200.39 | 1.62 | 0.2 | 231.45 |
| 8 | 0.42 | 223.12 | 0.043 | 110.99 | 0.8 | 0.098 | 232.05 |
| 12 | 0.32 | 225.26 | 0.029 | 73.87 | 0.53 | 0.078 | 231.45 |
| 16 | 0.23 | 227.26 | 0.022 | 55.61 | 0.4 | 0.056 | 231.9 |
总耗时变化规律如下:
![bert benchmark](../../../doc/bert-benchmark-batch-size-1.png)
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def prepare_batch_data(insts,
                       total_token_num,
                       max_seq_len=128,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False):
    """
    Turn a list of instances into padded numpy inputs.

    Each instance is indexed positionally: field 0 holds the token ids,
    field 1 the sentence (segment) ids and field 2 the position ids. Every
    further field (index 3 and up, e.g. SQuAD start/end positions or a
    unique id) is converted to an int64 column of shape [batch, 1].

    Returns a list in this order:
        [src_id, pos_id, sent_id, input_mask, *label_columns]
    where the first three entries and the mask come from `pad_batch_data`
    called with `pad_idx=pad_id` and `max_seq_len=max_seq_len`.

    `total_token_num`, `cls_id`, `sep_id`, `mask_id`, `return_input_mask`,
    `return_max_len` and `return_num_token` are accepted but not read by
    this function.
    """
    token_seqs = [sample[0] for sample in insts]
    segment_seqs = [sample[1] for sample in insts]
    position_seqs = [sample[2] for sample in insts]

    # Trailing per-instance fields (labels, unique ids, ...): one int64
    # column per field, in field order.
    label_columns = [
        np.array([sample[field] for sample in insts]).astype("int64").reshape(
            [-1, 1]) for field in range(3, len(insts[0]))
    ]

    # Only the token ids request the input mask; positions and segments are
    # padded without it.
    padded_tokens, input_mask = pad_batch_data(
        token_seqs,
        pad_idx=pad_id,
        max_seq_len=max_seq_len,
        return_input_mask=True)
    padded_positions = pad_batch_data(
        position_seqs,
        pad_idx=pad_id,
        max_seq_len=max_seq_len,
        return_pos=False,
        return_input_mask=False)
    padded_segments = pad_batch_data(
        segment_seqs,
        pad_idx=pad_id,
        max_seq_len=max_seq_len,
        return_pos=False,
        return_input_mask=False)

    outputs = [padded_tokens, padded_positions, padded_segments, input_mask]
    outputs.extend(label_columns)
    # `outputs` always holds at least the four padded arrays, so the list is
    # returned as-is.
    return outputs
def pad_batch_data(insts,
                   pad_idx=0,
                   max_seq_len=128,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   return_seq_lens=False):
    """
    Pad (or clip) every instance to exactly `max_seq_len` entries and
    optionally produce the matching position ids, input mask and length data.

    Args:
        insts: iterable of id sequences.
        pad_idx: value appended to sequences shorter than `max_seq_len`.
        max_seq_len: fixed output length of every sequence.
        return_pos: also return position ids (0..len-1, padded with
            `pad_idx`) as int64 of shape [batch, max_seq_len, 1].
        return_input_mask: also return a float32 mask of shape
            [batch, max_seq_len, 1] with 1 on real tokens and 0 on padding.
        return_max_len: also return `max_seq_len`.
        return_num_token: also return the total number of kept tokens.
        return_seq_lens: also return the kept lengths as int64 [batch, 1].

    Returns:
        The padded ids as int64 of shape [batch, max_seq_len, 1] when no
        extra output is requested, otherwise a list starting with that array
        followed by the requested extras in the order listed above.
    """
    max_len = max_seq_len

    # Clip over-long instances. Without this, `[pad_idx] * negative` is an
    # empty list, the row stays longer than max_len, and the reshape below
    # either fails or silently folds one sequence into several batch rows.
    # Lengths reported below are the clipped lengths.
    insts = [list(inst)[:max_len] for inst in insts]
    pad_counts = [max_len - len(inst) for inst in insts]

    return_list = []

    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(
        [inst + [pad_idx] * pad for inst, pad in zip(insts, pad_counts)])
    return_list.append(inst_data.astype("int64").reshape([-1, max_len, 1]))

    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(len(inst))) + [pad_idx] * pad
            for inst, pad in zip(insts, pad_counts)
        ])
        return_list.append(inst_pos.astype("int64").reshape([-1, max_len, 1]))

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array(
            [[1] * len(inst) + [0] * pad
             for inst, pad in zip(insts, pad_counts)])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list.append(input_mask_data.astype("float32"))

    if return_max_len:
        return_list.append(max_len)

    if return_num_token:
        return_list.append(sum(len(inst) for inst in insts))

    if return_seq_lens:
        seq_lens = np.array([len(inst) for inst in insts])
        return_list.append(seq_lens.astype("int64").reshape([-1, 1]))

    return return_list if len(return_list) > 1 else return_list[0]
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import unicode_literals, absolute_import
import os
import sys
import time
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from batching import pad_batch_data
import tokenization
import requests
import json
from bert_reader import BertReader
args = benchmark_args()
def single_func(idx, resource):
    """
    Worker body for one benchmark thread: send 1000 requests built from the
    first 1000 lines of data-c.txt and time them.

    Args:
        idx: index of this worker; selects the endpoint round-robin.
        resource: dict whose "endpoint" entry is a list of "host:port"
            strings.

    Returns:
        [[elapsed_seconds]] for the request loop.

    Raises:
        ValueError: if args.request is neither "rpc" nor "http".
    """
    # Read the whole data set up front and close the file right away.
    with open("data-c.txt") as fin:
        dataset = [line.strip() for line in fin]

    endpoints = resource["endpoint"]
    endpoint = endpoints[idx % len(endpoints)]

    if args.request == "rpc":
        reader = BertReader(vocab_file="vocab.txt", max_seq_len=20)
        fetch = ["pooled_output"]
        client = Client()
        client.load_client_config(args.model)
        client.connect([endpoint])
        # Client set-up is deliberately excluded from the timed section.
        start = time.time()
        for i in range(1000):
            if args.batch_size == 1:
                feed_dict = reader.process(dataset[i])
                result = client.predict(feed=feed_dict, fetch=fetch)
            else:
                print("unsupport batch size {}".format(args.batch_size))
    elif args.request == "http":
        # The URL and header are the same for every request.
        url = 'http://{}/bert/prediction'.format(endpoint)
        start = time.time()
        header = {"Content-Type": "application/json"}
        for i in range(1000):
            dict_data = {"words": dataset[i], "fetch": ["pooled_output"]}
            r = requests.post(url, data=json.dumps(dict_data), headers=header)
    else:
        # Previously this fell through to `end - start` with `start` unbound.
        raise ValueError("unsupported request type {}".format(args.request))
    end = time.time()
    return [[end - start]]
if __name__ == '__main__':
    # Run `single_func` on args.thread workers against one local endpoint,
    # then report the mean of the per-worker timings found in result[0].
    endpoint_list = ["127.0.0.1:9292"]
    runner = MultiThreadRunner()
    result = runner.run(single_func, args.thread, {"endpoint": endpoint_list})
    total_cost = sum(result[0][worker] for worker in range(args.thread))
    avg_cost = total_cost / args.thread
    print("average total cost {} s.".format(avg_cost))
# Run benchmark.py over several client thread counts and collect the
# per-run profile summaries in profile_log.

# -f: do not complain when no log from a previous run exists.
rm -f profile_log

# benchmark.py only issues requests when the batch size is 1. Set it
# explicitly so the value written to profile_log is not empty.
batch_size=1

for thread_num in 1 2 4 8 16
do
    "$PYTHONROOT/bin/python" benchmark.py --thread "$thread_num" --batch_size "$batch_size" --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
    echo "========================================"
    echo "batch size : $batch_size" >> profile_log
    "$PYTHONROOT/bin/python" ../util/show_profile.py profile "$thread_num" >> profile_log
    # The last line of the run output is the "average total cost" summary.
    tail -n 1 profile >> profile_log
done
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import unicode_literals, absolute_import
import os
import sys
import time
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from batching import pad_batch_data
import tokenization
import requests
import json
from bert_reader import BertReader
args = benchmark_args()
def single_func(idx, resource):
    """
    Worker body for one batch-benchmark thread: build one batch from the
    first args.batch_size lines of data-c.txt and send it 1000 times over
    RPC, timing the request loop.

    Args:
        idx: index of this worker; selects the endpoint round-robin.
        resource: dict whose "endpoint" entry is a list of "host:port"
            strings.

    Returns:
        [[elapsed_seconds]] for the request loop.

    Raises:
        ValueError: if args.request is "http" (batch prediction is not
            available over HTTP) or any other value than "rpc".
    """
    # Read the whole data set up front and close the file right away.
    with open("data-c.txt") as fin:
        dataset = [line.strip() for line in fin]

    if args.request == "rpc":
        reader = BertReader(vocab_file="vocab.txt", max_seq_len=20)
        fetch = ["pooled_output"]
        client = Client()
        client.load_client_config(args.model)
        endpoints = resource["endpoint"]
        client.connect([endpoints[idx % len(endpoints)]])
        # The same pre-processed batch is reused for every request, so
        # pre-processing stays outside the timed section.
        feed_batch = [
            reader.process(dataset[bi]) for bi in range(args.batch_size)
        ]
        start = time.time()
        for i in range(1000):
            if args.batch_size >= 1:
                result = client.batch_predict(
                    feed_batch=feed_batch, fetch=fetch)
            else:
                print("unsupport batch size {}".format(args.batch_size))
    elif args.request == "http":
        # `raise ("...")` raised a TypeError about non-exception objects and
        # hid the actual message; raise a real exception instead.
        raise ValueError("no batch predict for http")
    else:
        # Previously this fell through to `end - start` with `start` unbound.
        raise ValueError("unsupported request type {}".format(args.request))
    end = time.time()
    return [[end - start]]
if __name__ == '__main__':
    # Run `single_func` on args.thread workers spread over four local
    # endpoints, then report the mean of the per-worker timings in result[0].
    endpoint_list = [
        "127.0.0.1:9295",
        "127.0.0.1:9296",
        "127.0.0.1:9297",
        "127.0.0.1:9298",
    ]
    runner = MultiThreadRunner()
    result = runner.run(single_func, args.thread, {"endpoint": endpoint_list})
    total_cost = sum(result[0][worker] for worker in range(args.thread))
    avg_cost = total_cost / args.thread
    print("average total cost {} s.".format(avg_cost))
# Start a 4-GPU serving process, then run benchmark_batch.py for every
# combination of client thread count and batch size, collecting the per-run
# profile summaries in profile_log.

# -f: do not complain when no log from a previous run exists.
rm -f profile_log

export CUDA_VISIBLE_DEVICES=0,1,2,3
# NOTE(review): benchmark_batch.py targets ports 9295-9298 while only
# --port 9295 is given here -- confirm that the serve command exposes all
# four ports when several gpu_ids are passed.
python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
server_pid=$!
# Give the server time to load the model before sending requests.
sleep 5

for thread_num in 1 2 4 8 16
do
    for batch_size in 1 2 4 8 16 32 64 128 256 512
    do
        "$PYTHONROOT/bin/python" benchmark_batch.py --thread "$thread_num" --batch_size "$batch_size" --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
        echo "========================================"
        echo "thread num: $thread_num"
        echo "batch size: $batch_size"
        echo "batch size : $batch_size" >> profile_log
        "$PYTHONROOT/bin/python" ../util/show_profile.py profile "$thread_num" >> profile_log
        # The last line of the run output is the "average total cost" summary.
        tail -n 1 profile >> profile_log
    done
done

# Stop the background server started above so it does not keep the GPUs busy.
# NOTE(review): only the launcher process is signalled; check whether it
# spawns child processes that need separate clean-up.
kill "$server_pid"
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册