提交 0cef40a5 编写于 作者: H HexToString

Merge branch 'develop' of github.com:HexToString/Serving into develop

......@@ -49,6 +49,9 @@ set(THIRD_PARTY_BUILD_TYPE Release)
option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" OFF)
option(WITH_MKL "Compile Paddle Serving with MKL support." OFF)
option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" OFF)
option(WITH_LITE "Compile Paddle Serving with Paddle Lite Engine" OFF)
option(WITH_XPU "Compile Paddle Serving with Baidu Kunlun" OFF)
option(WITH_PYTHON "Compile Paddle Serving with Python" ON)
option(CLIENT "Compile Paddle Serving Client" OFF)
option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
......@@ -66,40 +69,40 @@ if (NOT DEFINED WITH_MKLDNN)
endif()
endif()
if (SERVER)
include(external/jsoncpp)
#include(external/rocksdb)
endif()
if (SERVER OR CLIENT)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/brpc)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
include(flags)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/brpc)
include(external/gflags)
include(external/glog)
if (WITH_PYTHON)
include(external/pybind11)
include(external/python)
endif()
include(generic)
include(flags)
endif()
if (APP)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
endif()
if (SERVER)
include(external/cudnn)
include(paddlepaddle)
include(external/jsoncpp)
#include(external/rocksdb)
include(external/cudnn)
include(paddlepaddle)
endif()
message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
......@@ -125,26 +128,24 @@ set(EXTERNAL_LIBS
)
if(SERVER)
if(WITH_MKLML)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
endif()
endif()
if(SERVER)
if(WITH_MKLDNN)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
endif()
endif()
if (SERVER)
list(APPEND EXTERNAL_LIBS paddlepaddle)
endif()
add_subdirectory(core)
if(SERVER)
add_subdirectory(paddle_inference)
add_subdirectory(paddle_inference)
endif()
add_subdirectory(python)
if (WITH_PYTHON)
add_subdirectory(python)
endif()
......@@ -47,9 +47,10 @@ nvidia-docker exec -it test bash
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-app==0.2.0
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source, add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to pip command) to speed up the download.
......@@ -66,15 +67,6 @@ For **Windows Users**, please read the document [Paddle Serving for Windows User
<h2 align="center"> Pre-built services with Paddle Serving</h2>
<h3 align="center">Latest release</h4>
<p align="center">
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/ocr">Optical Character Recognition</a>
<br>
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/faster_rcnn_model">Object Detection</a>
<br>
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/deeplabv3">Image Segmentation</a>
<p>
<h3 align="center">Chinese Word Segmentation</h4>
``` shell
......@@ -133,7 +125,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
</center>
``` python
```python
# A user can visit rpc service through paddle_serving_client API
from paddle_serving_client import Client
import numpy as np
......@@ -147,13 +140,6 @@ print(fetch_map)
```
Here, `client.predict` function has two arguments. `feed` is a `python dict` with model input variable alias name and values. `fetch` assigns the prediction variables to be returned from servers. In the example, the name of `"x"` and `"price"` are assigned when the servable model is saved during training.
<h2 align="center">Some Key Features of Paddle Serving</h2>
- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**.
- **Industrial serving features** supported, such as models management, online loading, online A/B testing etc.
- **Distributed Key-Value indexing** supported which is especially useful for large scale sparse features as model inputs.
- **Highly concurrent and efficient communication** between clients and servers supported.
- **Multiple programming languages** supported on client side, such as Golang, C++ and python.
### WEB service
......@@ -189,6 +175,14 @@ the response is
{"result":{"price":[[18.901151657104492]]}}
```
<h2 align="center">Some Key Features of Paddle Serving</h2>
- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**.
- **Industrial serving features** supported, such as models management, online loading, online A/B testing etc.
- **Distributed Key-Value indexing** supported which is especially useful for large scale sparse features as model inputs.
- **Highly concurrent and efficient communication** between clients and servers supported.
- **Multiple programming languages** supported on client side, such as Golang, C++ and python.
<h2 align="center">Document</h2>
### New to Paddle Serving
......@@ -235,6 +229,10 @@ To connect with other users and contributors, welcome to join our [Slack channel
If you want to contribute code to Paddle Serving, please reference [Contribution Guidelines](doc/CONTRIBUTE.md)
- Special Thanks to [@BeyondYourself](https://github.com/BeyondYourself) in complementing the gRPC tutorial, updating the FAQ doc and modifying the mdkir command
- Special Thanks to [@mcl-stone](https://github.com/mcl-stone) in updating faster_rcnn benchmark
- Special Thanks to [@cg82616424](https://github.com/cg82616424) in updating the unet benchmark and modifying resize comment error
### Feedback
For any feedback or to report a bug, please propose a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues).
......
......@@ -49,9 +49,10 @@ nvidia-docker exec -it test bash
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-app==0.2.0
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
```
您可能需要使用国内镜像源(例如清华源, 在pip命令中添加`-i https://pypi.tuna.tsinghua.edu.cn/simple`)来加速下载。
......@@ -148,7 +149,7 @@ print(fetch_map)
在这里,`client.predict`函数具有两个参数。 `feed`是带有模型输入变量别名和值的`python dict``fetch`被要从服务器返回的预测变量赋值。 在该示例中,在训练过程中保存可服务模型时,被赋值的tensor名为`"x"``"price"`
<h3 align="center">HTTP服务</h3>
用户也可以将数据格式处理逻辑放在服务器端进行,这样就可以直接用curl去访问服务,参考如下案例,在目录``python/examples/fit_a_line``
用户也可以将数据格式处理逻辑放在服务器端进行,这样就可以直接用curl去访问服务,参考如下案例,在目录`python/examples/fit_a_line`
```python
from paddle_serving_server.web_service import WebService
......@@ -232,6 +233,10 @@ curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1
如果您想为Paddle Serving贡献代码,请参考 [Contribution Guidelines](doc/CONTRIBUTE.md)
- 特别感谢 [@BeyondYourself](https://github.com/BeyondYourself) 提供grpc教程,更新FAQ教程,整理文件目录。
- 特别感谢 [@mcl-stone](https://github.com/mcl-stone) 提供faster rcnn benchmark脚本
- 特别感谢 [@cg82616424](https://github.com/cg82616424) 提供unet benchmark脚本和修改部分注释错误
### 反馈
如有任何反馈或是bug,请在 [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues)提交
......
......@@ -22,8 +22,9 @@ set(BOOST_PROJECT "extern_boost")
# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_VER "1.74.0")
set(BOOST_TAR "boost_1_74_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
......@@ -13,6 +13,9 @@
# limitations under the License.
INCLUDE(ExternalProject)
set(BRPC_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-narrowing")
set(BRPC_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(BRPC_CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-narrowing")
find_package(OpenSSL REQUIRED)
......@@ -35,19 +38,28 @@ INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
if(WITH_LITE)
set(BRPC_REPO "https://github.com/zhangjun/incubator-brpc.git")
set(BRPC_TAG "master")
else()
set(BRPC_REPO "https://github.com/wangjiawei04/brpc")
set(BRPC_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47")
endif()
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(gongwb): change to de newst repo when they changed.
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
GIT_REPOSITORY ${BRPC_REPO}
GIT_TAG ${BRPC_TAG}
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${BRPC_CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${BRPC_CMAKE_C_FLAGS}
-DCMAKE_CPP_FLAGS=${BRPC_CMAKE_CPP_FLAGS}
-DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
......
......@@ -93,7 +93,11 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
if(WITH_LITE OR WITH_XPU)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -fopenmp -pthread -ldl -lrt")
else()
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif()
endif(NOT APPLE)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
......
......@@ -31,14 +31,20 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "1.8.4")
#SET(PADDLE_VERSION "2.0.0-rc1")
SET(PADDLE_VERSION "latest")
if (WITH_GPU)
if (WITH_TRT)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7-avx-mkl-trt6")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
endif()
elseif (WITH_LITE)
if (WITH_XPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
endif()
else()
if (WITH_AVX)
if (WITH_MKLML)
......@@ -51,7 +57,12 @@ else()
endif()
endif()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
if(WITH_LITE)
SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
else()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
if (WITH_GPU OR WITH_MKLML)
if (WITH_TRT)
......@@ -114,25 +125,48 @@ ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
endif()
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
if (WITH_LITE)
ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
if (WITH_XPU)
ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpuapi PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpuapi.so)
ADD_LIBRARY(xpurt SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpurt PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpurt.so)
endif()
endif()
ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a)
ADD_LIBRARY(cryptopp STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/cryptopp/lib/libcryptopp.a)
LIST(APPEND external_project_dependencies paddle)
LIST(APPEND paddle_depend_libs
xxhash)
xxhash cryptopp)
if(WITH_LITE)
LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
if(WITH_XPU)
LIST(APPEND paddle_depend_libs xpuapi xpurt)
endif()
endif()
if(WITH_TRT)
LIST(APPEND paddle_depend_libs
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
endif()
......@@ -14,10 +14,6 @@ list(APPEND configure_srcs ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
add_library(configure ${configure_srcs})
add_dependencies(configure brpc)
add_executable(test_configure
${CMAKE_CURRENT_LIST_DIR}/tests/test_configure.cpp)
target_link_libraries(test_configure configure protobuf)
install(TARGETS configure
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
......@@ -31,6 +27,8 @@ install(FILES ${inc}
DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
endif()
if (WITH_PYTHON)
py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
......@@ -45,19 +43,19 @@ add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E to
add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
add_custom_command(TARGET sdk_configure_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......@@ -65,7 +63,7 @@ endif()
if (APP)
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......@@ -74,29 +72,29 @@ if (SERVER)
py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(server_config_py_proto server_config_py_proto_init)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(TARGET server_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINRARY_DIR})
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else()
add_custom_command(TARGET server_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py
COMMAND cp -f *.py
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated python proto into directory
paddle_serving_server_gpu/proto."
......@@ -105,7 +103,7 @@ add_custom_command(TARGET server_config_py_proto POST_BUILD
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py
COMMAND cp -f *.py
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated general_model_config proto file into directory
paddle_serving_server_gpu/proto."
......@@ -113,8 +111,10 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server_gpu/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
endif()
......@@ -45,6 +45,8 @@ message EngineDesc {
optional bool force_update_static_cache = 15;
optional bool enable_ir_optimization = 16;
optional bool use_trt = 17;
optional bool use_lite = 18;
optional bool use_xpu = 19;
};
// model_toolkit conf
......
......@@ -6,6 +6,11 @@ add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-
if (WITH_GPU)
add_dependencies(serving fluid_gpu_engine)
endif()
if (WITH_LITE)
add_dependencies(serving fluid_arm_engine)
endif()
target_include_directories(serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
......@@ -15,6 +20,11 @@ if(WITH_GPU)
-Wl,--no-whole-archive)
endif()
if(WITH_LITE)
target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
-Wl,--no-whole-archive)
endif()
target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
-Wl,--no-whole-archive)
......
......@@ -38,145 +38,7 @@ using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralDistKVInferOp::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
std::vector<uint64_t> keys;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0;
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
++dense_count;
continue;
}
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
}
key_len += elem_num;
int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
Timer timeline;
int64_t cube_start = timeline.TimeStampUS();
timeline.Start();
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "(logid=" << log_id
<< ") cube init error or cube config not given.";
return -1;
}
int ret = cube->seek(table_names[0], keys, &values);
int64_t cube_end = timeline.TimeStampUS();
if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "(logid=" << log_id << ") cube value return null";
}
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
dense_out.resize(dense_count);
int cube_val_idx = 0;
int sparse_idx = 0;
int dense_idx = 0;
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
++dense_idx;
continue;
}
sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
std::copy(in->at(i).lod[x].begin(),
in->at(i).lod[x].end(),
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
cube_val_idx++;
}
++sparse_idx;
}
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
output_blob->SetBatchSize(batch_size);
output_blob->SetLogId(log_id);
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
int64_t start = timeline.TimeStampUS();
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, cube_start);
AddBlobInfo(output_blob, cube_end);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
int GeneralDistKVInferOp::inference() { return 0; }
DEFINE_OP(GeneralDistKVInferOp);
} // namespace serving
......
......@@ -188,21 +188,6 @@ int GeneralDistKVQuantInferOp::inference() {
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralDistKVQuantInferOp);
......
......@@ -44,45 +44,9 @@ int GeneralInferOp::inference() {
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
output_blob->SetLogId(log_id);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
if (InferManager::instance().infer(engine_name().c_str())) {
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->_batch_size;
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
output_blob->_batch_size = batch_size;
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
engine_name().c_str(), in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name().c_str();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralInferOp);
......
......@@ -20,6 +20,7 @@
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
namespace baidu {
......@@ -32,6 +33,7 @@ using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::InferManager;
int conf_check(const Request *req,
const std::shared_ptr<PaddleGeneralModelConfig> &model_config) {
......@@ -71,75 +73,34 @@ int conf_check(const Request *req,
int GeneralReaderOp::inference() {
// reade request from client
// TODO: only support one engine here
std::string engine_name = "general_infer_0";
const Request *req = dynamic_cast<const Request *>(get_request_message());
uint64_t log_id = req->log_id();
int input_var_num = 0;
std::vector<int64_t> elem_type;
std::vector<int64_t> elem_size;
std::vector<int64_t> capacity;
GeneralBlob *res = mutable_data<GeneralBlob>();
TensorVector *out = &res->tensor_vector;
res->SetLogId(log_id);
if (!res) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed get op tls reader object output";
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
int var_num = req->insts(0).tensor_array_size();
VLOG(2) << "(logid=" << log_id << ") var num: " << var_num;
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
VLOG(2) << "(logid=" << log_id << ") print general model config done.";
// TODO(guru4elephant): how to do conditional check?
/*
int ret = conf_check(req, model_config);
if (ret != 0) {
LOG(ERROR) << "model conf of server:";
resource.print_general_model_config(model_config);
return 0;
}
*/
// package tensor
elem_type.resize(var_num);
elem_size.resize(var_num);
capacity.resize(var_num);
// prepare basic information for input
for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor lod_tensor;
elem_type[i] = req->insts(0).tensor_array(i).elem_type();
VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
if (elem_type[i] == 0) { // int64
elem_size[i] = sizeof(int64_t);
lod_tensor.dtype = paddle::PaddleDType::INT64;
} else if (elem_type[i] == 1) {
elem_size[i] = sizeof(float);
lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
} else if (elem_type[i] == 2) {
elem_size[i] = sizeof(int32_t);
lod_tensor.dtype = paddle::PaddleDType::INT32;
}
// implement lod tensor here
std::string tensor_name = model_config->_feed_name[i];
VLOG(2) << "(logid=" << log_id << ") get tensor name: " << tensor_name;
auto lod_tensor = InferManager::instance().GetInputHandle(
engine_name.c_str(), tensor_name.c_str());
std::vector<std::vector<size_t>> lod;
std::vector<int> shape;
// get lod info here
if (req->insts(0).tensor_array(i).lod_size() > 0) {
VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
lod_tensor.lod.resize(1);
lod.resize(1);
for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
lod_tensor.lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
}
capacity[i] = 1;
for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
......@@ -147,7 +108,7 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
......@@ -158,92 +119,41 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
}
lod_tensor.name = model_config->_feed_name[i];
out->push_back(lod_tensor);
}
// specify the memory needed for output tensor_vector
for (int i = 0; i < var_num; ++i) {
if (out->at(i).lod.size() == 1) {
int tensor_size = 0;
const Tensor &tensor = req->insts(0).tensor_array(i);
int data_len = 0;
if (tensor.int64_data_size() > 0) {
data_len = tensor.int64_data_size();
} else if (tensor.float_data_size() > 0) {
data_len = tensor.float_data_size();
} else if (tensor.int_data_size() > 0) {
data_len = tensor.int_data_size();
}
VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
<< "]: " << data_len;
tensor_size += data_len;
int cur_len = out->at(i).lod[0].back();
VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len;
int sample_len = 0;
if (tensor.shape_size() == 1) {
sample_len = data_len;
} else {
sample_len = tensor.shape(0);
}
VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
out->at(i).data.Resize(tensor_size * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is lod_tensor and len=" << out->at(i).lod[0].back();
} else {
out->at(i).data.Resize(capacity[i] * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor and capacity=" << capacity[i];
}
}
// fill the data into output general_blob
for (int i = 0; i < var_num; ++i) {
if (elem_type[i] == 0) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).int64_data(0);
int offset = 0;
lod_tensor->SetLoD(lod);
lod_tensor->Reshape(shape);
// insert data here
if (req->insts(0).tensor_array(i).elem_type() == 0) {
// TODO: Copy twice here, can optimize
int elem_num = req->insts(0).tensor_array(i).int64_data_size();
std::vector<int64_t> data(elem_num);
int64_t *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).int64_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).int64_data(k);
}
} else if (elem_type[i] == 1) {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).float_data(0);
int offset = 0;
lod_tensor->CopyFromCpu(dst_ptr);
} else if (req->insts(0).tensor_array(i).elem_type() == 1) {
int elem_num = req->insts(0).tensor_array(i).float_data_size();
std::vector<float> data(elem_num);
float *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).float_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).float_data(k);
}
} else if (elem_type[i] == 2) {
int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).int_data(0);
int offset = 0;
lod_tensor->CopyFromCpu(dst_ptr);
} else if (req->insts(0).tensor_array(i).elem_type() == 2) {
int elem_num = req->insts(0).tensor_array(i).int_data_size();
std::vector<int32_t> data(elem_num);
int32_t *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).int_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).int_data(k);
}
lod_tensor->CopyFromCpu(dst_ptr);
}
}
VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
timeline.Pause();
int64_t end = timeline.TimeStampUS();
res->p_size = 0;
res->_batch_size = 1;
AddBlobInfo(res, start);
AddBlobInfo(res, end);
VLOG(2) << "(logid=" << log_id << ") read data from client success";
return 0;
}
DEFINE_OP(GeneralReaderOp);
......
......@@ -40,160 +40,60 @@ using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralResponseOp::inference() {
const std::vector<std::string> pre_node_names = pre_names();
VLOG(2) << "pre node names size: " << pre_node_names.size();
const GeneralBlob *input_blob;
uint64_t log_id =
get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
const Request *req = dynamic_cast<const Request *>(get_request_message());
// response inst with only fetch_var_names
Response *res = mutable_data<Response>();
Timer timeline;
// double response_time = 0.0;
// timeline.Start();
int64_t start = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
VLOG(2) << "(logid=" << log_id
<< ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
std::vector<int> fetch_index;
fetch_index.resize(req->fetch_var_names_size());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
}
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
const std::string &pre_name = pre_node_names[pi];
VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
<< " (" << pre_node_names.size() << ")";
input_blob = get_depend_argument<GeneralBlob>(pre_name);
// fprintf(stderr, "input(%s) blob address %x\n", pre_names.c_str(),
// input_blob);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op: " << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
std::vector<int> capacity(req->fetch_var_names_size(), 1);
std::string engine_name = "general_infer_0";
ModelOutput *output = res->add_outputs();
// To get the order of model return values
output->set_engine_name(pre_name);
FetchInst *fetch_inst = output->add_insts();
for (auto &idx : fetch_index) {
Tensor *tensor = fetch_inst->add_tensor_array();
if (model_config->_is_lod_fetch[idx]) {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
<< model_config->_fetch_name[idx] << " is lod_tensor";
for (int k = 0; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
}
} else {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
<< model_config->_fetch_name[idx] << " is tensor";
for (int k = 0; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
}
}
}
int var_idx = 0;
for (auto &idx : fetch_index) {
int cap = 1;
for (int j = 0; j < in->at(idx).shape.size(); ++j) {
cap *= in->at(idx).shape[j];
}
FetchInst *fetch_p = output->mutable_insts(0);
auto dtype = in->at(idx).dtype;
std::vector<std::string> outs =
InferManager::instance().GetOutputNames(engine_name.c_str());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
Tensor *tensor = fetch_inst->add_tensor_array();
std::string tensor_name = outs[i];
auto lod_tensor = InferManager::instance().GetOutputHandle(
engine_name.c_str(), tensor_name.c_str());
std::vector<int> shape = lod_tensor->shape();
for (int k = 0; k < shape.size(); ++k) {
capacity[i] *= shape[k];
tensor->add_shape(shape[k]);
}
auto dtype = lod_tensor->type();
if (dtype == paddle::PaddleDType::INT64) {
VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
<< model_config->_fetch_name[idx] << "].";
int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
// from
// https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy
// `Swap` method is faster than `{}` method.
std::vector<int64_t> datas(capacity[i]);
int64_t *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
&tmp_data);
data_ptr + capacity[i]);
tensor->mutable_int64_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::FLOAT32) {
VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
<< model_config->_fetch_name[idx] << "].";
float *data_ptr = static_cast<float *>(in->at(idx).data.data());
std::vector<float> datas(capacity[i]);
float *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<float> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
&tmp_data);
data_ptr + capacity[i]);
tensor->mutable_float_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::INT32) {
VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
<< model_config->_fetch_name[idx] << "].";
int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
std::vector<int32_t> datas(capacity[i]);
int32_t *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
&tmp_data);
data_ptr + capacity[i]);
tensor->mutable_int_data()->Swap(&tmp_data);
}
if (model_config->_is_lod_fetch[idx]) {
if (in->at(idx).lod.size() > 0) {
for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_lod(
in->at(idx).lod[0][j]);
std::vector<std::vector<size_t>> lod = lod_tensor->lod();
if (lod.size() > 0) {
for (int j = 0; j < lod[0].size(); ++j) {
tensor->add_lod(lod[0][j]);
}
}
}
VLOG(2) << "(logid=" << log_id << ") fetch var ["
<< model_config->_fetch_name[idx] << "] ready";
var_idx++;
}
}
if (req->profile_server()) {
int64_t end = timeline.TimeStampUS();
// TODO(barriery): multi-model profile_time.
// At present, only the response_op is multi-input, so here we get
// the profile_time by hard coding. It needs to be replaced with
// a more elegant way.
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
input_blob = get_depend_argument<GeneralBlob>(pre_node_names[pi]);
VLOG(2) << "(logid=" << log_id
<< ") p size for input blob: " << input_blob->p_size;
int profile_time_idx = -1;
if (pi == 0) {
profile_time_idx = 0;
} else {
profile_time_idx = input_blob->p_size - 2;
}
for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
}
}
// TODO(guru4elephant): find more elegant way to do this
res->add_profile_time(start);
res->add_profile_time(end);
}
return 0;
}
......
......@@ -12,13 +12,12 @@ set_source_files_properties(
${pdserving_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_fluid)
if (WITH_TRT)
add_definitions(-DWITH_TRT)
endif()
target_link_libraries(pdserving
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_fluid ${paddle_depend_libs})
# install
install(TARGETS pdserving
RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin
......
......@@ -20,10 +20,9 @@
#include <utility>
#include <vector>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/bsf.h"
#include "core/predictor/framework/factory.h"
#include "core/predictor/framework/infer_data.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace predictor {
......@@ -39,6 +38,8 @@ class InferEngineCreationParams {
_static_optimization = false;
_force_update_static_cache = false;
_use_trt = false;
_use_lite = false;
_use_xpu = false;
}
void set_path(const std::string& path) { _path = path; }
......@@ -53,6 +54,10 @@ class InferEngineCreationParams {
void set_use_trt(bool use_trt) { _use_trt = use_trt; }
void set_use_lite(bool use_lite) { _use_lite = use_lite; }
void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
bool enable_memory_optimization() const {
return _enable_memory_optimization;
}
......@@ -61,6 +66,10 @@ class InferEngineCreationParams {
bool use_trt() const { return _use_trt; }
bool use_lite() const { return _use_lite; }
bool use_xpu() const { return _use_xpu; }
void set_static_optimization(bool static_optimization = false) {
_static_optimization = static_optimization;
}
......@@ -80,6 +89,9 @@ class InferEngineCreationParams {
<< "model_path = " << _path << ", "
<< "enable_memory_optimization = " << _enable_memory_optimization
<< ", "
<< "enable_tensorrt = " << _use_trt << ", "
<< "enable_lite = " << _use_lite << ", "
<< "enable_xpu = " << _use_xpu << ", "
<< "enable_ir_optimization = " << _enable_ir_optimization << ", "
<< "static_optimization = " << _static_optimization << ", "
<< "force_update_static_cache = " << _force_update_static_cache;
......@@ -92,6 +104,8 @@ class InferEngineCreationParams {
bool _static_optimization;
bool _force_update_static_cache;
bool _use_trt;
bool _use_lite;
bool _use_xpu;
};
class InferEngine {
......@@ -105,9 +119,7 @@ class InferEngine {
virtual int thrd_initialize() { return thrd_initialize_impl(); }
virtual int thrd_clear() { return thrd_clear_impl(); }
virtual int thrd_finalize() { return thrd_finalize_impl(); }
virtual int infer(const void* in, void* out, uint32_t batch_size = -1) {
return infer_impl1(in, out, batch_size);
}
virtual int infer() { return infer_impl(); }
virtual int reload() = 0;
......@@ -120,11 +132,13 @@ class InferEngine {
virtual int thrd_finalize_impl() = 0;
virtual int thrd_clear_impl() = 0;
virtual int proc_finalize_impl() = 0;
virtual int infer_impl1(const void* in,
void* out,
uint32_t batch_size = -1) = 0;
virtual int infer_impl2(const BatchTensor& in,
BatchTensor& out) = 0; // NOLINT
virtual std::vector<std::string> GetInputNames() = 0;
virtual std::vector<std::string> GetOutputNames() = 0;
virtual std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) = 0;
virtual std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) = 0;
virtual int infer_impl() = 0;
// end: framework inner call
};
......@@ -138,8 +152,6 @@ class ReloadableInferEngine : public InferEngine {
uint64_t last_revision;
};
typedef im::bsf::Task<Tensor, Tensor> TaskT;
virtual int load(const InferEngineCreationParams& params) = 0;
int proc_initialize_impl(const configure::EngineDesc& conf, bool version) {
......@@ -182,6 +194,14 @@ class ReloadableInferEngine : public InferEngine {
_infer_engine_params.set_use_trt(conf.use_trt());
}
if (conf.has_use_lite()) {
_infer_engine_params.set_use_lite(conf.use_lite());
}
if (conf.has_use_xpu()) {
_infer_engine_params.set_use_xpu(conf.use_xpu());
}
if (!check_need_reload() || load(_infer_engine_params) != 0) {
LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
return -1;
......@@ -201,45 +221,10 @@ class ReloadableInferEngine : public InferEngine {
LOG(ERROR) << "Failed proc initialize impl";
return -1;
}
// init bsf framework
if (_infer_thread_num <= 0) {
return 0;
}
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_reset_fn(
boost::bind(&InferEngine::thrd_clear_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_callback_fn(
boost::bind(&InferEngine::infer_impl2, this, _1, _2));
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_size(_infer_batch_size);
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_align(
_infer_batch_align);
if (im::bsf::TaskExecutor<TaskT>::instance()->start(_infer_thread_num) !=
0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
return -1;
}
LOG(WARNING) << "Enable batch schedule framework, thread_num:"
<< _infer_thread_num << ", batch_size:" << _infer_batch_size
<< ", enable_batch_align:" << _infer_batch_align;
return 0;
}
int infer(const void* in, void* out, uint32_t batch_size = -1) {
if (_infer_thread_num <= 0) {
return infer_impl1(in, out, batch_size);
}
im::bsf::TaskManager<Tensor, Tensor> task_manager;
task_manager.schedule(*(reinterpret_cast<const BatchTensor*>(in)),
*(reinterpret_cast<BatchTensor*>(out)));
task_manager.wait();
return 0;
}
int infer() { return infer_impl(); }
int thrd_initialize() {
if (_infer_thread_num > 0) {
......@@ -263,10 +248,6 @@ class ReloadableInferEngine : public InferEngine {
return -1;
}
if (_infer_thread_num > 0) {
im::bsf::TaskExecutor<TaskT>::instance()->stop();
}
return 0;
}
......@@ -417,10 +398,6 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _infer_engine_params) != 0) {
......@@ -430,17 +407,12 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
}
THREAD_SETSPECIFIC(_skey, md);
im::bsf::AutoMutex lock(_mutex);
_reload_vec.push_back(md);
return 0;
}
int thrd_clear_impl() {
// for non-serving-threads
if (MempoolWrapper::instance().thread_clear() != 0) {
LOG(ERROR) << "Failed thread clear mempool";
return -1;
}
return 0;
}
......@@ -538,12 +510,6 @@ class CloneDBReloadableInferEngine
}
virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _pd->cores[_pd->current_idx]) != 0) {
LOG(ERROR) << "Failed clone thread data, origin_core["
......@@ -552,7 +518,6 @@ class CloneDBReloadableInferEngine
}
THREAD_SETSPECIFIC(DBReloadableInferEngine<EngineCore>::_skey, md);
im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
DBReloadableInferEngine<EngineCore>::_reload_vec.push_back(md);
return 0;
}
......@@ -571,8 +536,45 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
public: // NOLINT
FluidInferEngine() {}
~FluidInferEngine() {}
std::vector<std::string> GetInputNames() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetInputNames();
}
std::vector<std::string> GetOutputNames() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetOutputHandle()";
}
return core->GetOutputHandle(name);
}
int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
int infer_impl() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
......@@ -580,16 +582,12 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
return -1;
}
if (!core->Run(in, out)) {
if (!core->Run()) {
LOG(ERROR) << "Failed run fluid family core";
return -1;
}
return 0;
}
int infer_impl2(const BatchTensor& in, BatchTensor& out) { // NOLINT
return infer_impl1(&in, &out);
}
};
typedef FactoryPool<InferEngine> StaticInferFactory;
......@@ -715,13 +713,45 @@ class VersionedInferEngine : public InferEngine {
return _versions.begin()->second;
}
int infer(const void* in, void* out, uint32_t batch_size) {
int infer() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
return -1;
}
return engine->infer(in, out, batch_size);
return engine->infer();
}
std::vector<std::string> GetInputNames() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetInputNames();
}
std::vector<std::string> GetOutputNames() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetOutputHandle(name);
}
template <typename T>
......@@ -740,14 +770,47 @@ class VersionedInferEngine : public InferEngine {
}
// versioned inference interface
int infer(const void* in, void* out, uint32_t batch_size, uint64_t version) {
int infer(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
return -1;
}
return iter->second->infer(in, out, batch_size);
return iter->second->infer();
}
std::vector<std::string> GetInputNames(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetInputNames();
}
std::vector<std::string> GetOutputNames(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
uint64_t version, const std::string& name) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
uint64_t version, const std::string& name) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetOutputHandle(name);
}
template <typename T>
......@@ -774,12 +837,7 @@ class VersionedInferEngine : public InferEngine {
int thrd_finalize_impl() { return -1; }
int thrd_clear_impl() { return -1; }
int proc_finalize_impl() { return -1; }
int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
return -1;
}
int infer_impl2(const BatchTensor& in, BatchTensor& out) { // NOLINT
return -1;
} // NOLINT
int infer_impl() { return -1; }
private:
boost::unordered_map<uint64_t, InferEngine*> _versions;
......@@ -877,16 +935,44 @@ class InferManager {
}
// Inference interface
int infer(const char* model_name,
const void* in,
void* out,
uint32_t batch_size = -1) {
int infer(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return -1;
}
return it->second->infer(in, out, batch_size);
return it->second->infer();
}
std::vector<std::string> GetInputNames(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputNames();
}
std::vector<std::string> GetOutputNames(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const char* model_name, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const char* model_name, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputHandle(name);
}
template <typename T>
......@@ -906,19 +992,48 @@ class InferManager {
}
// Versioned inference interface
int infer(const char* model_name,
const void* in,
void* out,
uint32_t batch_size,
uint64_t version) {
int infer(const char* model_name, uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return -1;
}
return it->second->infer(in, out, batch_size, version);
return it->second->infer(version);
}
std::vector<std::string> GetInputNames(const char* model_name,
uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputNames(version);
}
std::vector<std::string> GetOutputNames(const char* model_name,
uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputNames(version);
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const char* model_name, uint64_t version, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputHandle(version, name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const char* model_name, uint64_t version, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputHandle(version, name);
}
template <typename T>
T* get_core(const char* model_name, uint64_t version) {
auto it = _map.find(model_name);
......
......@@ -56,21 +56,25 @@ the script of client side bert_client.py is as follow:
[//file]:#bert_client.py
``` python
import os
import sys
from paddle_serving_client import Client
from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader
import numpy as np
args = benchmark_args()
reader = ChineseBertReader()
reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"]
endpoint_list = ["127.0.0.1:9292"]
endpoint_list = ['127.0.0.1:9292']
client = Client()
client.load_client_config("bert_seq20_client/serving_client_conf.prototxt")
client.load_client_config(args.model)
client.connect(endpoint_list)
for line in sys.stdin:
feed_dict = reader.process(line)
result = client.predict(feed=feed_dict, fetch=fetch)
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
```
run
......
......@@ -52,18 +52,23 @@ pip install paddle_serving_app
``` python
import sys
from paddle_serving_client import Client
from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader
import numpy as np
args = benchmark_args()
reader = ChineseBertReader()
reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"]
endpoint_list = ["127.0.0.1:9292"]
endpoint_list = ['127.0.0.1:9292']
client = Client()
client.load_client_config("bert_seq20_client/serving_client_conf.prototxt")
client.load_client_config(args.model)
client.connect(endpoint_list)
for line in sys.stdin:
feed_dict = reader.process(line)
result = client.predict(feed=feed_dict, fetch=fetch)
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
```
执行
......
......@@ -122,6 +122,7 @@ make -j10
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
export TENSORRT_LIBRARY_PATH="/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/"
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
......
......@@ -676,7 +676,7 @@ service_throughput = 1 / 最慢OP的耗时 * 并发数
service_avg_cost = ∑op_concurrency 【关键路径】
Channel堆积:
channel_acc_size = QPS(down - up) * time
channel_acc_size = QPS(down - up) * time
批量预测平均耗时:
avg_batch_cost = (N * pre + mid + post) / N
......
......@@ -32,63 +32,9 @@ The `-p` option is to map the `9292` port of the container to the `9292` port of
### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
```bash
pip install paddle-serving-server
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
```shell
pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### Test example
Get the trained Boston house price prediction model by the following command:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- Test HTTP service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```bash
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
The mirror comes with `paddle_serving_server`, `paddle_serving_client`, and `paddle_serving_app` corresponding to the mirror tag version. If users don’t need to change the version, they can use it directly, which is suitable for environments without extranet services.
If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.
## GPU
......@@ -100,7 +46,7 @@ The GPU version is basically the same as the CPU version, with only some differe
Refer to [this document](DOCKER_IMAGES.md) for a docker image, the following is an example of an `cuda9.0-cudnn7` image:
```shell
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
```
### Create container
......@@ -110,77 +56,21 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se
nvidia-docker exec -it test bash
```
The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
or
```bash
pip install paddle-serving-server-gpu
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
```shell
pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### Test example
When running the GPU Server, you need to set the GPUs used by the prediction service through the `--gpu_ids` option, and the CPU is used by default. An error will be reported when the value of `--gpu_ids` exceeds the environment variable `CUDA_VISIBLE_DEVICES`. The following example specifies to use a GPU with index 0:
```shell
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
```
Get the trained Boston house price prediction model by the following command:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
docker exec -it test bash
```
- Test HTTP service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```bash
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
### Install PaddleServing
The mirror comes with `paddle_serving_server_gpu`, `paddle_serving_client`, and `paddle_serving_app` corresponding to the mirror tag version. If users don’t need to change the version, they can use it directly, which is suitable for environments without extranet services.
If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.
## Attention
## Precautious
Runtime images cannot be used for compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md).
......@@ -20,7 +20,6 @@ Docker(GPU版本需要在GPU机器上安装nvidia-docker)
docker pull hub.baidubce.com/paddlepaddle/serving:latest
```
### 创建容器并进入
```bash
......@@ -32,74 +31,11 @@ docker exec -it test bash
### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装。
```bash
pip install paddle-serving-server
```
您可能需要使用国内镜像源(例如清华源)来加速下载。
```shell
pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 测试example
通过下面命令获取训练好的Boston房价预估模型:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- 测试HTTP服务
在Server端(容器内)运行:
镜像里自带对应镜像tag版本的`paddle_serving_server``paddle_serving_client``paddle_serving_app`,如果用户不需要更改版本,可以直接使用,适用于没有外网服务的环境。
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
```
如果需要更换版本,请参照首页的指导,下载对应版本的pip包。
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
```
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码:
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## GPU版本
GPU版本与CPU版本基本一致,只有部分接口命名的差别(GPU版本需要在GPU机器上安装nvidia-docker)。
### 获取镜像
参考[该文档](DOCKER_IMAGES_CN.md)获取镜像,这里以 `cuda9.0-cudnn7` 的镜像为例:
```shell
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
```
## GPU 版本
### 创建容器并进入
......@@ -107,74 +43,19 @@ nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
nvidia-docker exec -it test bash
```
`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装。
或者
```bash
pip install paddle-serving-server-gpu
```
您可能需要使用国内镜像源(例如清华源)来加速下载。
```shell
pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 测试example
在运行GPU版Server时需要通过`--gpu_ids`选项设置预测服务使用的GPU,缺省状态默认使用CPU。当设置的`--gpu_ids`超出环境变量`CUDA_VISIBLE_DEVICES`时会报错。下面的示例为指定使用索引为0的GPU:
```shell
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
```
通过下面命令获取训练好的Boston房价预估模型:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
docker exec -it test bash
```
- 测试HTTP服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
```
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
```
`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码:
### 安装PaddleServing
```bash
from paddle_serving_client import Client
镜像里自带对应镜像tag版本的`paddle_serving_server_gpu``paddle_serving_client``paddle_serving_app`,如果用户不需要更改版本,可以直接使用,适用于没有外网服务的环境。
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
如果需要更换版本,请参照首页的指导,下载对应版本的pip包。
## 注意事项
......
......@@ -49,4 +49,4 @@ Arguments are the same as `inference_model_to_serving` API.
| `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. |
| `serving_client` | str | `"serving_client"` | The path of configuration files for client. |
| `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. |
| `paras_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
| `params_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
......@@ -50,4 +50,4 @@ python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
| `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |
| `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client |
| `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 |
| `paras_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |
| `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |
......@@ -8,7 +8,7 @@ This document guides users how to build Paddle Serving service on the Windows pl
### Running Paddle Serving on Native Windows System
**Configure Python environment variables to PATH**: First, you need to add the directory where the Python executable program is located to the PATH. Usually in **System Properties/My Computer Properties**-**Advanced**-**Environment Variables**, click Path and add the path at the beginning. For example, `C:\Users\$USER\AppData\Local\Programs\Python\Python36`, and finally click **OK** continuously. If you enter python on Powershell, you can enter the python interactive interface, indicating that the environment variable configuration is successful.
**Configure Python environment variables to PATH**: **We only support Python 3.5+ on Native Windows System.**. First, you need to add the directory where the Python executable program is located to the PATH. Usually in **System Properties/My Computer Properties**-**Advanced**-**Environment Variables**, click Path and add the path at the beginning. For example, `C:\Users\$USER\AppData\Local\Programs\Python\Python36`, and finally click **OK** continuously. If you enter python on Powershell, you can enter the python interactive interface, indicating that the environment variable configuration is successful.
**Install wget**: Because all the downloads in the tutorial and the built-in model download function in `paddle_serving_app` all use the wget tool, download the binary package at the [link](http://gnuwin32.sourceforge.net/packages/wget.htm), unzip and copy it to `C:\Windows\System32`, if there is a security prompt, you need to pass it.
......@@ -32,6 +32,7 @@ python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_
```
git clone https://github.com/paddlepaddle/Serving
pip install -r python/requirements_win.txt
```
**Run OCR example**:
......
......@@ -8,7 +8,7 @@
### 原生Windows系统运行Paddle Serving
**配置Python环境变量到PATH**:首先需要将Python的可执行程序所在目录加入到PATH当中。通常在**系统属性/我的电脑属性**-**高级**-**环境变量** ,点选Path,并在开头加上路径。例如`C:\Users\$USER\AppData\Local\Programs\Python\Python36`,最后连续点击**确定** 。在Powershell上如果输入python可以进入python交互界面,说明环境变量配置成功。
**配置Python环境变量到PATH****目前原生Windows仅支持Python 3.5或更高版本**首先需要将Python的可执行程序所在目录加入到PATH当中。通常在**系统属性/我的电脑属性**-**高级**-**环境变量** ,点选Path,并在开头加上路径。例如`C:\Users\$USER\AppData\Local\Programs\Python\Python36`,最后连续点击**确定** 。在Powershell上如果输入python可以进入python交互界面,说明环境变量配置成功。
**安装wget工具**:由于教程当中所有的下载,以及`paddle_serving_app`当中内嵌的模型下载功能,都是用到wget工具,在链接[下载wget](http://gnuwin32.sourceforge.net/packages/wget.htm),解压后复制到`C:\Windows\System32`下,如有安全提示需要通过。
......@@ -32,6 +32,7 @@ python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_
```
git clone https://github.com/paddlepaddle/Serving
pip install -r python/requirements_win.txt
```
**运行OCR示例**
......
## Java Demo
## Tutorial of Java Client for Paddle Serving
(English|[简体中文](./README_CN.md))
### Development Environment
In order to facilitate users to use java for development, we provide the compiled Serving project to be placed in the java mirror. The way to get the mirror and enter the development environment is
```
docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker exec -it java_serving bash
cd Serving/java
```
The Serving folder is at the develop branch when the docker image is generated. You need to git pull to the latest version or git checkout to the desired branch.
### Install client dependencies
Due to the large number of dependent libraries, the image has been compiled once at the time of generation, and the user can perform the following operations
### Install package
```
mvn compile
mvn install
......@@ -9,18 +27,93 @@ mvn compile
mvn install
```
### Start Server
### Start the server(not pipeline)
take the fit_a_line demo as example
Take the fit_a_line model as an example, the server starts
```
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
```
### Client Predict
Client prediction
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
```
The Java example also contains the prediction client of Bert, Model_enaemble, asyn_predict, batch_predict, Cube_local, Cube_quant, and Yolov4 models.
Take yolov4 as an example, the server starts
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang & #It needs to be executed in GPU Docker, otherwise the execution method of CPU must be used.
```
Client prediction
```
# in /Serving/java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# The case of yolov4 needs to specify a picture as input
```
### Start the server(pipeline)
as for input data type = string,take IMDB model ensemble as an example,the server starts
```
cd ../../python/examples/pipeline/imdb_model_ensemble
sh get_data.sh
python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
Client prediction(Synchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
```
Client prediction(Asynchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
```
as for input data type = INDArray,take uci_housing_model as an example,the server starts
```
cd ../../python/examples/pipeline/simple_web_service
sh get_data.sh
python web_service_java.py &>log.txt &
```
Client prediction(Synchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
```
### Customization guidance
The above example is running in CPU mode. If GPU mode is required, there are two options.
The first is that GPU Serving and Java Client are in the same image. After starting the corresponding image, the user needs to move /Serving/java in the java image to the corresponding image.
The second is to deploy GPU Serving and Java Client separately. If they are on the same host, you can learn the IP address of the corresponding container through ifconfig, and then when you connect to client.connect in `examples/src/main/java/PaddleServingClientExample.java` Make changes to the endpoint, and then compile it again. Or select `--net=host` to bind the network device of docker and host when docker starts, so that it can run directly without customizing java code.
**It should be noted that in the example, all models need to use `--use_multilang` to start GRPC multi-programming language support, and the port number is 9393. If you need another port, you need to modify it in the java file**
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). Pipeline Serving Client for Java is released, the next version multi-thread java client example will be released**
**It should be noted that in the example, Java Pipeline Client code is in path /Java/Examples and /Java/src/main, and the Pipeline server code is in path /python/examples/pipeline/**
## Java 示例
## 用于Paddle Serving的Java客户端
([English](./README.md)|简体中文)
### 开发环境
为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是
```
docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker exec -it java_serving bash
cd Serving/java
```
Serving文件夹是镜像生成时的develop分支工程目录,需要git pull 到最新版本,或者git checkout 到想要的分支。
### 安装客户端依赖
由于依赖库数量庞大,因此镜像已经在生成时编译过一次,用户执行以下操作即可
```
mvn compile
mvn install
......@@ -9,18 +27,95 @@ mvn compile
mvn install
```
### 启动服务端
### 启动服务端(非pipeline方式)
以fit_a_line模型为例
以fit_a_line模型为例,服务端启动
```
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
```
### 客户端预测
客户端预测
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
```
java示例中还包含了bert、model_enaemble、asyn_predict、batch_predict、cube_local、cube_quant、yolov4模型的预测客户端。
以yolov4为例子,服务端启动
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang & #需要在GPU Docker当中执行,否则要使用CPU的执行方式。
```
客户端预测
```
# in /Serving/java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# yolov4的案例需要指定一个图片作为输入
```
### 启动服务端(Pipeline方式)
对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动
```
cd ../../python/examples/pipeline/imdb_model_ensemble
sh get_data.sh
python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
客户端预测(同步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
```
客户端预测(异步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
```
对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动
```
cd ../../python/examples/pipeline/simple_web_service
sh get_data.sh
python web_service_java.py &>log.txt &
```
客户端预测(同步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
```
### 二次开发指导
上述示例是在CPU模式下运行,如果需要GPU模式,可以有两种选择。
第一种是GPU Serving和Java Client在同一个镜像,需要用户在启动对应的镜像后,把java镜像当中的/Serving/java移动到对应的镜像中。
第二种是GPU Serving和Java Client分开部署,如果在同一台宿主机,可以通过ifconfig了解对应容器的IP地址,然后在`examples/src/main/java/PaddleServingClientExample.java`当中对client.connect时的endpoint做修改,然后再编译一次。 或者在docker启动时选择 `--net=host`来绑定docker和宿主机的网络设备,这样不需要定制java代码可以直接运行。
**需要注意的是,在示例中,所有模型都需要使用`--use_multilang`来启动GRPC多编程语言支持,以及端口号都是9393,如果需要别的端口,需要在java文件里修改**
**目前Serving已推出Pipeline模式(详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布,下个更新会发布Java版本的多线程用例敬请期待。**
**需要注意的是,Java Pipeline Client相关示例在/Java/Examples和/Java/src/main中,对应的Pipeline server在/python/examples/pipeline/中**
**目前Serving已推出Pipeline模式(详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),下个版本(0.4.1)面向Java的Pipeline Serving Client将会发布,敬请期待。**
import io.paddle.serving.pipelineclient.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.datavec.image.loader.NativeImageLoader;
import org.nd4j.linalg.api.ops.CustomOp;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.*;
/**
* this class give an example for using the client to predict(grpc)
* StaticPipelineClient.client supports mutil-thread.
* By setting StaticPipelineClient.client properties,you can change the Maximum concurrency
* Do not need to generate multiple instances of client,Use the StaticPipelineClient.client or SingleTon instead.
* @author HexToString
*/
public class PipelineClientExample {
/**
* This method gives an example of synchronous prediction whose input type is string.
*/
boolean string_imdb_predict() {
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("words", "i am very sad | 0");
}};
System.out.println(feed_data);
List<String> fetch = Arrays.asList("prediction");
System.out.println(fetch);
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
System.out.println("connect failed.");
return false;
}
}
HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method gives an example of asynchronous prediction whose input type is string.
*/
boolean asyn_predict() {
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("words", "i am very sad | 0");
}};
System.out.println(feed_data);
List<String> fetch = Arrays.asList("prediction");
System.out.println(fetch);
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
System.out.println("connect failed.");
return false;
}
}
PipelineFuture future = StaticPipelineClient.client.asyn_pr::qedict(feed_data, fetch,false,0);
HashMap<String,String> result = future.get();
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method gives an example of synchronous prediction whose input type is Array or list or matrix.
* use Nd4j.createFromArray method to convert Array to INDArray.
* use convertINDArrayToString method to convert INDArray to specified String type(for python Numpy eval method).
*/
boolean indarray_predict() {
float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f, -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("x", convertINDArrayToString(npdata));
}};
List<String> fetch = Arrays.asList("prediction");
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","9998")){
System.out.println("connect failed.");
return false;
}
}
HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method convert INDArray to specified String type.
* @param npdata INDArray type(The input data).
* @return String (specified String type for python Numpy eval method).
*/
String convertINDArrayToString(INDArray npdata){
return "array("+npdata.toString()+")";
}
/**
* This method is entry function.
* @param args String[] type(Command line parameters)
*/
public static void main( String[] args ) {
PipelineClientExample e = new PipelineClientExample();
boolean succ = false;
if (args.length < 1) {
System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type>.");
System.out.println("<test-type>: fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4");
return;
}
String testType = args[0];
System.out.format("[Example] %s\n", testType);
if ("string_imdb_predict".equals(testType)) {
succ = e.string_imdb_predict();
}else if ("asyn_predict".equals(testType)) {
succ = e.asyn_predict();
}else if ("indarray_predict".equals(testType)) {
succ = e.indarray_predict();
} else {
System.out.format("test-type(%s) not match.\n", testType);
return;
}
if (succ == true) {
System.out.println("[Example] succ.");
} else {
System.out.println("[Example] fail.");
}
}
}
import io.paddle.serving.pipelineclient.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.datavec.image.loader.NativeImageLoader;
import org.nd4j.linalg.api.ops.CustomOp;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.*;
/**
* static resource management class
* @author HexToString
*/
public class StaticPipelineClient {
/**
* Static Variable PipelineClient
*/
public static PipelineClient client = new PipelineClient();
/**
* the sign of connect status
*/
public static boolean succ = false;
/**
* This method returns the sign of connect status.
* @param strIp String type(The server ipv4) such as "192.168.10.10".
* @param strPort String type(The server port) such as "8891".
* @return boolean (the sign of connect status).
*/
public static boolean initClient(String strIp,String strPort){
String target = strIp+ ":"+ strPort;//"172.17.0.2:18070";
System.out.println("initial connect.");
if(succ){
System.out.println("already connect.");
return true;
}
succ = clieint.connect(target);
if (succ != true) {
System.out.println("connect failed.");
return false;
}
return true;
}
}
package io.paddle.serving.pipelineclient;
import java.util.*;
import java.util.function.Function;
import java.lang.management.ManagementFactory;
import java.lang.management.RuntimeMXBean;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import io.grpc.StatusRuntimeException;
import com.google.protobuf.ByteString;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.factory.Nd4j;
import io.paddle.serving.pipelineproto.*;
import io.paddle.serving.pipelineclient.PipelineFuture;
/**
* PipelineClient class defination
* @author HexToString
*/
public class PipelineClient {
private ManagedChannel channel_;
private PipelineServiceGrpc.PipelineServiceBlockingStub blockingStub_;
private PipelineServiceGrpc.PipelineServiceFutureStub futureStub_;
private String clientip;
private String _profile_key;
private String _profile_value;
public PipelineClient() {
channel_ = null;
blockingStub_ = null;
futureStub_ = null;
boolean is_profile = false;
clientip = null;
_profile_value = "1";
_profile_key = "pipeline.profile";
}
/**
* This method returns the sign of connect status.
* @param target String type(The server ipv4 and port) such as "192.168.10.10:8891".
* @return boolean (the sign of connect status).
*/
public boolean connect(String target) {
try {
String[] temp = target.split(":");
this.clientip = temp[0] == "localhost"?"127.0.0.1":temp[0];
channel_ = ManagedChannelBuilder.forTarget(target)
.defaultLoadBalancingPolicy("round_robin")
.maxInboundMessageSize(Integer.MAX_VALUE)
.usePlaintext()
.build();
blockingStub_ = PipelineServiceGrpc.newBlockingStub(channel_);
futureStub_ = PipelineServiceGrpc.newFutureStub(channel_);
} catch (Exception e) {
System.out.format("Connect failed: %s\n", e.toString());
return false;
}
return true;
}
/**
* This method returns the Packaged Request.
* @param feed_dict HashMap<String, String>(input data).
* @param profile boolean(profile sign).
* @param logid int
* @return Request (the grpc protobuf Request).
*/
private Request _packInferenceRequest(
HashMap<String, String> feed_dict,
boolean profile,
int logid) throws IllegalArgumentException {
List<String> keys = new ArrayList<String>();
List<String> values = new ArrayList<String>();
long[] flattened_shape = {-1};
Request.Builder req_builder = Request.newBuilder()
.setClientip(this.clientip)
.setLogid(logid);
for (Map.Entry<String, String> entry : feed_dict.entrySet()) {
keys.add(entry.getKey());
values.add(entry.getValue());
}
if(profile){
keys.add(_profile_key);
values.add(_profile_value);
}
req_builder.addAllKey(keys);
req_builder.addAllValue(values);
return req_builder.build();
}
/**
* This method returns the HashMap which is unpackaged from Response.
* @param resp Response(the grpc protobuf Response).
* @return HashMap<String,String> (the output).
*/
private HashMap<String,String> _unpackResponse(Response resp) throws IllegalArgumentException{
return PipelineClient._staitcUnpackResponse(resp);
}
/**
* This static method returns the HashMap which is unpackaged from Response.
* @param resp Response(the grpc protobuf Response).
* @return HashMap<String,String> (the output).
*/
private static HashMap<String,String> _staitcUnpackResponse(Response resp) {
HashMap<String,String> ret_Map = new HashMap<String,String>();
int err_no = resp.getErrNo();
if ( err_no!= 0) {
return null;
}
List<String> keys = resp.getKeyList();
List<String> values= resp.getValueList();
for (int i = 0;i<keys.size();i++) {
ret_Map.put(keys.get(i),values.get(i));
}
return ret_Map;
}
/**
* The synchronous prediction method.
* @param feed_batch HashMap<String, String>(input data).
* @param fetch Iterable<String>(the output key list).
* @param profile boolean(profile sign).
* @param logid int
* @return HashMap<String,String> (the output).
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile,
int logid) {
try {
Request req = _packInferenceRequest(
feed_batch, profile,logid);
Response resp = blockingStub_.inference(req);
return _unpackResponse(resp);
} catch (StatusRuntimeException e) {
System.out.format("Failed to predict: %s\n", e.toString());
return null;
}
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch) {
return predict(feed_batch,fetch,false,0);
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile) {
return predict(feed_batch,fetch,profile,0);
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
int logid) {
return predict(feed_batch,fetch,false,logid);
}
/**
* The asynchronous prediction method.use future.get() to get the result.
* @param feed_batch HashMap<String, String>(input data).
* @param fetch Iterable<String>(the output key list).
* @param profile boolean(profile sign).
* @param logid int
* @return PipelineFuture(the output future).
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile,
int logid) {
Request req = _packInferenceRequest(
feed_batch, profile, logid);
ListenableFuture<Response> future = futureStub_.inference(req);
PipelineFuture predict_future = new PipelineFuture(future,
(Response resp) -> {
return PipelineClient._staitcUnpackResponse(resp);
}
);
return predict_future;
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch) {
return asyn_predict(feed_batch,fetch,false,0);
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile) {
return asyn_predict(feed_batch,fetch,profile,0);
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
int logid) {
return asyn_predict(feed_batch,fetch,false,logid);
}
}
package io.paddle.serving.pipelineclient;
import java.util.*;
import java.util.function.Function;
import io.grpc.StatusRuntimeException;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import io.paddle.serving.pipelineclient.PipelineClient;
import io.paddle.serving.pipelineproto.*;
/**
* PipelineFuture class is for asynchronous prediction
* @author HexToString
*/
public class PipelineFuture {
private ListenableFuture<Response> callFuture_;
private Function<Response,
HashMap<String,String> > callBackFunc_;
PipelineFuture(ListenableFuture<Response> call_future,
Function<Response,
HashMap<String,String> > call_back_func) {
callFuture_ = call_future;
callBackFunc_ = call_back_func;
}
/**
* use this method to get the result of asynchronous prediction.
*/
public HashMap<String,String> get() {
Response resp = null;
try {
resp = callFuture_.get();
} catch (Exception e) {
System.out.format("predict failed: %s\n", e.toString());
return null;
}
HashMap<String,String> result
= callBackFunc_.apply(resp);
return result;
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
option java_multiple_files = true;
option java_package = "io.paddle.serving.pipelineproto";
option java_outer_classname = "PipelineProto";
package baidu.paddle_serving.pipeline_serving;
message Request {
repeated string key = 1;
repeated string value = 2;
optional string name = 3;
optional string method = 4;
optional int64 logid = 5;
optional string clientip = 6;
};
message Response {
optional int32 err_no = 1;
optional string err_msg = 2;
repeated string key = 3;
repeated string value = 4;
};
service PipelineService {
rpc inference(Request) returns (Response) {}
};
......@@ -13,8 +13,13 @@
# limitations under the License
if (NOT CLIENT_ONLY)
add_subdirectory(inferencer-fluid-cpu)
if (WITH_GPU)
add_subdirectory(inferencer-fluid-gpu)
endif()
add_subdirectory(inferencer-fluid-cpu)
if (WITH_GPU)
add_subdirectory(inferencer-fluid-gpu)
endif()
if (WITH_LITE)
add_subdirectory(inferencer-fluid-arm)
endif()
endif()
FILE(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
target_include_directories(fluid_arm_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
target_link_libraries(fluid_arm_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_arm_engine
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pthread.h>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::PrecisionType;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int create(const predictor::InferEngineCreationParams& params) = 0;
virtual int clone(void* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
return 0;
}
virtual void* get() { return _core.get(); }
protected:
std::shared_ptr<Predictor> _core;
};
// infer interface
class FluidArmAnalysisCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidArmAnalysisDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
}
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() {
_row = 0;
_col = 0;
if (_params != NULL) {
free(_params);
_params = NULL;
}
}
int load() {
if (_params == NULL || _row <= 0 || _col <= 0) {
LOG(ERROR) << "load parameter error [not inited].";
return -1;
}
FILE* fs = fopen(_file_name.c_str(), "rb");
if (fs == NULL) {
LOG(ERROR) << "load " << _file_name << " fopen error.";
return -1;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
uint32_t matrix_size = _row * _col;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
VLOG(2) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0;
}
public:
std::string _file_name;
int _row;
int _col;
float* _params;
};
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h"
#include "core/predictor/framework/factory.h"
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidArmAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS_DIR");
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
......@@ -28,8 +28,6 @@ namespace baidu {
namespace paddle_serving {
namespace fluid_cpu {
using configure::SigmoidConf;
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
......@@ -57,31 +55,36 @@ class GlobalPaddleCreateMutex {
pthread_mutex_t _mut;
};
class GlobalSigmoidCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalSigmoidCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
......@@ -92,8 +95,7 @@ class FluidFamilyCore {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
paddle::PaddlePredictor* p_predictor =
(paddle::PaddlePredictor*)origin_core;
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
......@@ -105,7 +107,7 @@ class FluidFamilyCore {
virtual void* get() { return _core.get(); }
protected:
std::unique_ptr<paddle::PaddlePredictor> _core;
std::shared_ptr<Predictor> _core;
};
// infer interface
......@@ -119,51 +121,19 @@ class FluidCpuAnalysisCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetParamsFile(data_path + "/__params__");
analysis_config.SetProgFile(data_path + "/__model__");
analysis_config.DisableGpu();
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuNativeCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
config.EnableMemoryOptim();
}
paddle::NativeConfig native_config;
native_config.param_file = data_path + "/__params__";
native_config.prog_file = data_path + "/__model__";
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -184,54 +154,24 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.DisableGpu();
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
config.SwitchIrOptim(false);
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuNativeDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -323,209 +263,57 @@ class Parameter {
float* _params;
};
class SigmoidModel {
public:
~SigmoidModel() {}
int load(const char* sigmoid_w_file,
const char* sigmoid_b_file,
float exp_max,
float exp_min) {
AutoLock lock(GlobalSigmoidCreateMutex::instance());
if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
LOG(ERROR) << "load params sigmoid_w failed.";
return -1;
}
VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
<< _sigmoid_w._params[1] << "].";
if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
LOG(ERROR) << "load params sigmoid_b failed.";
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
}
int softmax(float x, double& o) { // NOLINT
float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
_y0 = (_y0 > _exp_max_input)
? _exp_max_input
: ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
_y1 = (_y1 > _exp_max_input)
? _exp_max_input
: ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
o = 1.0f / (1.0f + exp(_y0 - _y1));
return 0;
}
public:
Parameter _sigmoid_w;
Parameter _sigmoid_b;
float _exp_max_input;
float _exp_min_input;
};
class SigmoidFluidModel {
class FluidCpuAnalysisEncryptCore : public FluidFamilyCore {
public:
int softmax(float x, double& o) { // NOLINT
return _sigmoid_core->softmax(x, o);
} // NOLINT
std::unique_ptr<SigmoidFluidModel> Clone() {
std::unique_ptr<SigmoidFluidModel> clone_model;
clone_model.reset(new SigmoidFluidModel());
clone_model->_sigmoid_core = _sigmoid_core;
clone_model->_fluid_core = _fluid_core->Clone();
return std::move(clone_model); // NOLINT
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
fin.seekg(0, std::ios::end);
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(contents->at(0)), contents->size());
fin.close();
}
public:
std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
std::shared_ptr<SigmoidModel> _sigmoid_core;
};
class FluidCpuWithSigmoidCore : public FluidFamilyCore {
public:
virtual ~FluidCpuWithSigmoidCore() {}
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string model_path = params.get_path();
size_t pos = model_path.find_last_of("/\\");
std::string conf_path = model_path.substr(0, pos);
std::string conf_file = model_path.substr(pos);
configure::SigmoidConf conf;
if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
LOG(ERROR) << "failed load model path: " << model_path;
return -1;
}
_core.reset(new SigmoidFluidModel);
std::string fluid_model_data_path = conf.dnn_model_path();
predictor::InferEngineCreationParams new_params(params);
new_params.set_path(fluid_model_data_path);
int ret = load_fluid_model(new_params);
if (ret < 0) {
LOG(ERROR) << "fail to load fluid model.";
return -1;
}
const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
float exp_max = conf.exp_max_input();
float exp_min = conf.exp_min_input();
_core->_sigmoid_core.reset(new SigmoidModel);
VLOG(2) << "create sigmoid core[" << _core->_sigmoid_core.get()
<< "], use count[" << _core->_sigmoid_core.use_count() << "].";
ret = _core->_sigmoid_core->load(
sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
if (ret < 0) {
LOG(ERROR) << "fail to load sigmoid model.";
return -1;
}
return 0;
}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->_fluid_core->Run(
*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int clone(SigmoidFluidModel* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
_core = origin_core->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
VLOG(2) << "clone sigmoid core[" << _core->_sigmoid_core.get()
<< "] use count[" << _core->_sigmoid_core.use_count() << "].";
return 0;
}
virtual SigmoidFluidModel* get() { return _core.get(); }
virtual int load_fluid_model(
const predictor::InferEngineCreationParams& params) = 0;
int softmax(float x, double& o) { // NOLINT
return _core->_sigmoid_core->softmax(x, o);
}
protected:
std::unique_ptr<SigmoidFluidModel> _core; // NOLINT
};
class FluidCpuNativeDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
LOG(ERROR) << "create paddle predictor failed, path note exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
std::string model_buffer, params_buffer, key_buffer;
ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
ReadBinaryFile(data_path + "key", &key_buffer);
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
VLOG(2) << "prepare for encryption model";
class FluidCpuAnalysisDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
auto cipher = paddle::MakeCipher("");
std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
Config analysis_config;
//paddle::AnalysisConfig analysis_config;
analysis_config.SetModelBuffer(&real_model_buffer[0],
real_model_buffer.size(),
&real_params_buffer[0],
real_params_buffer.size());
analysis_config.DisableGpu();
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
VLOG(2) << "decrypt model file sucess";
_core =
CreatePredictor(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
......
......@@ -30,28 +30,13 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_DIR");
#if 1
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuAnalysisDirWithSigmoidCore>,
FluidCpuAnalysisEncryptCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_DIR_SIGMOID");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuNativeDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE_DIR_SIGMOID");
"FLUID_CPU_ANALYSIS_ENCRYPT");
#endif
} // namespace fluid_cpu
} // namespace paddle_serving
} // namespace baidu
......@@ -61,31 +61,36 @@ class GlobalPaddleCreateMutex {
pthread_mutex_t _mut;
};
class GlobalSigmoidCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalSigmoidCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
......@@ -96,8 +101,7 @@ class FluidFamilyCore {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
paddle::PaddlePredictor* p_predictor =
(paddle::PaddlePredictor*)origin_core;
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
......@@ -109,7 +113,7 @@ class FluidFamilyCore {
virtual void* get() { return _core.get(); }
protected:
std::unique_ptr<paddle::PaddlePredictor> _core;
std::shared_ptr<Predictor> _core;
};
// infer interface
......@@ -123,51 +127,19 @@ class FluidGpuAnalysisCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetParamsFile(data_path + "/__params__");
analysis_config.SetProgFile(data_path + "/__model__");
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.EnableUseGpu(100, FLAGS_gpuid);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuNativeCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.param_file = data_path + "/__params__";
native_config.prog_file = data_path + "/__model__";
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -188,110 +160,38 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetModel(data_path);
config.EnableUseGpu(1500, FLAGS_gpuid);
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
#if 0 // todo: support flexible shape
int min_seq_len = 1;
int max_seq_len = 512;
int opt_seq_len = 128;
int head_number = 12;
int batch = 50;
std::vector<int> min_in_shape = {batch, min_seq_len, 1};
std::vector<int> max_in_shape = {batch, max_seq_len, 1};
std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
std::string input1_name = "src_text_a_ids";
std::string input2_name = "pos_text_a_ids";
std::string input3_name = "sent_text_a_ids";
std::string input4_name = "stack_0.tmp_0";
std::map<std::string, std::vector<int>> min_input_shape = {
{input1_name, min_in_shape},
{input2_name, min_in_shape},
{input3_name, min_in_shape},
{input4_name, {batch, head_number, min_seq_len, min_seq_len}},
};
std::map<std::string, std::vector<int>> max_input_shape = {
{input1_name, max_in_shape},
{input2_name, max_in_shape},
{input3_name, max_in_shape},
{input4_name, {batch, head_number, max_seq_len, max_seq_len}},
};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input1_name, opt_in_shape},
{input2_name, opt_in_shape},
{input3_name, opt_in_shape},
{input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
};
analysis_config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
#endif
config.EnableMemoryOptim();
}
int max_batch = 32;
int min_subgraph_size = 3;
if (params.use_trt()) {
analysis_config.EnableTensorRtEngine(
1 << 20,
config.EnableTensorRtEngine(1 << 20,
max_batch,
min_subgraph_size,
paddle::AnalysisConfig::Precision::kFloat32,
Config::Precision::kFloat32,
false,
false);
LOG(INFO) << "create TensorRT predictor";
} else {
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
config.SwitchIrOptim(false);
}
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuNativeDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -383,214 +283,6 @@ class Parameter {
float* _params;
};
class SigmoidModel {
public:
~SigmoidModel() {}
int load(const char* sigmoid_w_file,
const char* sigmoid_b_file,
float exp_max,
float exp_min) {
AutoLock lock(GlobalSigmoidCreateMutex::instance());
if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
LOG(ERROR) << "load params sigmoid_w failed.";
return -1;
}
VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
<< _sigmoid_w._params[1] << "].";
if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
LOG(ERROR) << "load params sigmoid_b failed.";
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
}
int softmax(float x, double& o) { // NOLINT
float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
_y0 = (_y0 > _exp_max_input)
? _exp_max_input
: ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
_y1 = (_y1 > _exp_max_input)
? _exp_max_input
: ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
o = 1.0f / (1.0f + exp(_y0 - _y1));
return 0;
}
public:
Parameter _sigmoid_w;
Parameter _sigmoid_b;
float _exp_max_input;
float _exp_min_input;
};
class SigmoidFluidModel {
public:
int softmax(float x, double& o) { // NOLINT
return _sigmoid_core->softmax(x, o);
} // NOLINT
std::unique_ptr<SigmoidFluidModel> Clone() {
std::unique_ptr<SigmoidFluidModel> clone_model;
clone_model.reset(new SigmoidFluidModel());
clone_model->_sigmoid_core = _sigmoid_core;
clone_model->_fluid_core = _fluid_core->Clone();
return std::move(clone_model);
}
public:
std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
std::shared_ptr<SigmoidModel> _sigmoid_core;
};
class FluidGpuWithSigmoidCore : public FluidFamilyCore {
public:
virtual ~FluidGpuWithSigmoidCore() {}
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string model_path = params.get_path();
size_t pos = model_path.find_last_of("/\\");
std::string conf_path = model_path.substr(0, pos);
std::string conf_file = model_path.substr(pos);
configure::SigmoidConf conf;
if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
LOG(ERROR) << "failed load model path: " << model_path;
return -1;
}
_core.reset(new SigmoidFluidModel);
std::string fluid_model_data_path = conf.dnn_model_path();
predictor::InferEngineCreationParams new_params(params);
new_params.set_path(fluid_model_data_path);
int ret = load_fluid_model(new_params);
if (ret < 0) {
LOG(ERROR) << "fail to load fluid model.";
return -1;
}
const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
float exp_max = conf.exp_max_input();
float exp_min = conf.exp_min_input();
_core->_sigmoid_core.reset(new SigmoidModel);
LOG(INFO) << "create sigmoid core[" << _core->_sigmoid_core.get()
<< "], use count[" << _core->_sigmoid_core.use_count() << "].";
ret = _core->_sigmoid_core->load(
sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
if (ret < 0) {
LOG(ERROR) << "fail to load sigmoid model.";
return -1;
}
return 0;
}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->_fluid_core->Run(
*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int clone(SigmoidFluidModel* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
_core = origin_core->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
LOG(INFO) << "clone sigmoid core[" << _core->_sigmoid_core.get()
<< "] use count[" << _core->_sigmoid_core.use_count() << "].";
return 0;
}
virtual SigmoidFluidModel* get() { return _core.get(); }
virtual int load_fluid_model(
const predictor::InferEngineCreationParams& params) = 0;
int softmax(float x, double& o) { // NOLINT
return _core->_sigmoid_core->softmax(x, o);
}
protected:
std::unique_ptr<SigmoidFluidModel> _core;
};
class FluidGpuNativeDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuAnalysisDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
......@@ -32,28 +32,6 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuAnalysisDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_DIR_SIGMOID");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuNativeDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE_DIR_SIGMOID");
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
......@@ -7,7 +7,7 @@ if (CLIENT)
endif()
if (SERVER)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
file(INSTALL pipeline DESTINATION paddle_serving_server)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
......@@ -34,7 +34,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
endif()
if (SERVER)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
......@@ -72,7 +72,7 @@ add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINA
endif()
if (SERVER)
if(NOT WITH_GPU)
if(NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
......@@ -90,6 +90,16 @@ if (SERVER)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......
## Criteo CTR with Sparse Parameter Indexing Service
([简体中文](./README_CN.md)|English)
### Get Sample Dataset
go to directory `python/examples/criteo_ctr_with_cube`
```
sh get_data.sh
```
### Download Model and Sparse Parameter Sequence Files
```
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
```
the model will be in ./ctr_server_model_kv and ./ctr_client_config.
### Start Sparse Parameter Indexing Service
```
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
```
Here, the sparse parameter is loaded by cube sparse parameter indexing service Cube.
### Start RPC Predictor, the number of serving thread is 4(configurable in test_server.py)
```
python test_server.py ctr_serving_model_kv
```
### Run Prediction
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
```
### Benchmark
CPU :Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
Model :[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
server core/thread num : 4/8
Run
```
bash benchmark.sh
```
1000 batches will be sent by every client
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
| 1 | 0.035 | 1.596 | 0.021 | 0.518 | 0.0024 | 0.0025 | 6.774 | 147.7 |
| 2 | 0.034 | 1.780 | 0.027 | 0.463 | 0.0020 | 0.0023 | 6.931 | 288.3 |
| 4 | 0.038 | 2.954 | 0.025 | 0.455 | 0.0019 | 0.0027 | 8.378 | 477.5 |
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
the average latency of threads
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
The QPS is
![qps](../../../doc/criteo-cube-benchmark-qps.png)
## 带稀疏参数索引服务的CTR预测服务
(简体中文|[English](./README.md))
### 获取样例数据
进入目录 `python/examples/criteo_ctr_with_cube`
```
sh get_data.sh
```
### 下载模型和稀疏参数序列文件
```
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
```
执行脚本后会在当前目录有ctr_server_model_kv和ctr_client_config文件夹。
### 启动稀疏参数索引服务
```
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
```
此处,模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中。
### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
```
python test_server.py ctr_serving_model_kv
```
### 执行预测
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
```
### Benchmark
设备 :Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
模型 :[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
server core/thread num : 4/8
执行
```
bash benchmark.sh
```
客户端每个线程会发送1000个batch
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
| 1 | 0.035 | 1.596 | 0.021 | 0.518 | 0.0024 | 0.0025 | 6.774 | 147.7 |
| 2 | 0.034 | 1.780 | 0.027 | 0.463 | 0.0020 | 0.0023 | 6.931 | 288.3 |
| 4 | 0.038 | 2.954 | 0.025 | 0.455 | 0.0019 | 0.0027 | 8.378 | 477.5 |
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
平均每个线程耗时图如下
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
每个线程QPS耗时如下
![qps](../../../doc/criteo-cube-benchmark-qps.png)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
default='./data/raw/train.txt',
help="The path of training dataset")
parser.add_argument(
'--sparse_only',
type=bool,
default=False,
help="Whether we use sparse features only")
parser.add_argument(
'--test_data_path',
type=str,
default='./data/raw/valid.txt',
help="The path of testing dataset")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_local',
type=int,
default=1,
help='Local train or distributed train (default: 1)')
parser.add_argument(
'--cloud_train',
type=int,
default=0,
help='Local train or distributed train on paddlecloud (default: 0)')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether start pserver in async mode to support ASGD')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether split variables into blocks when update_method is pserver')
parser.add_argument(
'--role',
type=str,
default='pserver', # trainer or pserver
help='The path for model to store (default: models)')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The path for model to store (default: 127.0.0.1:6000)')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='The path for model to store (default: models)')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The num of trianers, (default: 1)')
return parser.parse_args()
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import Client
import sys
import os
import criteo as criteo
import time
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from paddle_serving_client.metric import auc
py_version = sys.version_info[0]
args = benchmark_args()
def single_func(idx, resource):
client = Client()
print([resource["endpoint"][idx % len(resource["endpoint"])]])
client.load_client_config('ctr_client_conf/serving_client_conf.prototxt')
client.connect(['127.0.0.1:9292'])
batch = 1
buf_size = 100
dataset = criteo.CriteoDataset()
dataset.setup(1000001)
test_filelists = [
"./raw_data/part-%d" % x for x in range(len(os.listdir("./raw_data")))
]
reader = dataset.infer_reader(test_filelists[len(test_filelists) - 40:],
batch, buf_size)
if args.request == "rpc":
fetch = ["prob"]
start = time.time()
itr = 1000
for ei in range(itr):
if args.batch_size > 0:
feed_batch = []
for bi in range(args.batch_size):
if py_version == 2:
data = reader().next()
else:
data = reader().__next__()
feed_dict = {}
feed_dict['dense_input'] = data[0][0]
for i in range(1, 27):
feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][
i]
feed_batch.append(feed_dict)
result = client.predict(feed=feed_batch, fetch=fetch)
else:
print("unsupport batch size {}".format(args.batch_size))
elif args.request == "http":
raise ("Not support http service.")
end = time.time()
qps = itr * args.batch_size / (end - start)
return [[end - start, qps]]
if __name__ == '__main__':
multi_thread_runner = MultiThreadRunner()
endpoint_list = ["127.0.0.1:9292"]
#result = single_func(0, {"endpoint": endpoint_list})
start = time.time()
result = multi_thread_runner.run(single_func, args.thread,
{"endpoint": endpoint_list})
end = time.time()
total_cost = end - start
avg_cost = 0
qps = 0
for i in range(args.thread):
avg_cost += result[0][i * 2 + 0]
qps += result[0][i * 2 + 1]
avg_cost = avg_cost / args.thread
print("total cost: {}".format(total_cost))
print("average total cost {} s.".format(avg_cost))
print("qps {} ins/s".format(qps))
rm profile_log
export FLAGS_profile_client=1
export FLAGS_profile_server=1
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
python test_server.py ctr_serving_model_kv > serving_log 2>&1 &
for thread_num in 1 4 16
do
for batch_size in 1 4 16 64
do
$PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
echo "batch size : $batch_size"
echo "thread num : $thread_num"
echo "========================================"
echo "batch size : $batch_size" >> profile_log
$PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
tail -n 3 profile >> profile_log
done
done
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
rm profile_log
#wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
#tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
#wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
#tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
cp ../../../build_server/core/cube/cube-api/cube-cli .
python gen_key.py
for thread_num in 1 4 16 32
do
for batch_size in 1000
do
./cube-cli -config_file ./cube/conf/cube.conf -keys key -dict test_dict -thread_num $thread_num --batch $batch_size > profile 2>&1
echo "batch size : $batch_size"
echo "thread num : $thread_num"
echo "========================================"
echo "batch size : $batch_size" >> profile_log
echo "thread num : $thread_num" >> profile_log
tail -n 8 profile >> profile_log
done
done
ps -ef|grep 'cube'|grep -v grep|cut -c 9-15 | xargs kill -9
ps -ef | grep cube | awk {'print $2'} | xargs kill -9
rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
ps -ef | grep test | awk {'print $2'} | xargs kill -9
ps -ef | grep serving | awk {'print $2'} | xargs kill -9
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
class CriteoDataset(object):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import sys
import paddle.fluid.incubate.data_generator as dg
class CriteoDataset(dg.MultiSlotDataGenerator):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
[{
"dict_name": "test_dict",
"shard": 1,
"dup": 1,
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027"
}]
}]
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import print_function
from args import parse_args
import os
import paddle.fluid as fluid
import sys
from network_conf import dnn_model
dense_feature_dim = 13
def train():
args = parse_args()
sparse_only = args.sparse_only
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
for i in range(1, 27)
]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
#nn_input = None if sparse_only else dense_input
nn_input = dense_input
predict_y, loss, auc_var, batch_auc_var, infer_vars = dnn_model(
nn_input, sparse_input_ids, label, args.embedding_size,
args.sparse_feature_dim)
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
optimizer.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_use_var([dense_input] + sparse_input_ids + [label])
python_executable = "python"
pipe_command = "{} criteo_reader.py {}".format(python_executable,
args.sparse_feature_dim)
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(128)
thread_num = 10
dataset.set_thread(thread_num)
whole_filelist = [
"raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
]
print(whole_filelist)
dataset.set_filelist(whole_filelist[:100])
dataset.load_into_memory()
fluid.layers.Print(auc_var)
epochs = 1
for i in range(epochs):
exe.train_from_dataset(
program=fluid.default_main_program(), dataset=dataset, debug=True)
print("epoch {} finished".format(i))
import paddle_serving_client.io as server_io
feed_var_dict = {}
feed_var_dict['dense_input'] = dense_input
for i, sparse in enumerate(sparse_input_ids):
feed_var_dict["embedding_{}.tmp_0".format(i)] = sparse
fetch_var_dict = {"prob": predict_y}
feed_kv_dict = {}
feed_kv_dict['dense_input'] = dense_input
for i, emb in enumerate(infer_vars):
feed_kv_dict["embedding_{}.tmp_0".format(i)] = emb
fetch_var_dict = {"prob": predict_y}
server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict,
fetch_var_dict, fluid.default_main_program())
server_io.save_model("ctr_serving_model_kv", "ctr_client_conf_kv",
feed_kv_dict, fetch_var_dict,
fluid.default_main_program())
if __name__ == '__main__':
train()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import paddle.fluid as fluid
import math
def dnn_model(dense_input, sparse_inputs, label, embedding_size,
sparse_feature_dim):
def embedding_layer(input):
emb = fluid.layers.embedding(
input=input,
is_sparse=True,
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
x = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb, x
def mlp_input_tensor(emb_sums, dense_tensor):
#if isinstance(dense_tensor, fluid.Variable):
# return fluid.layers.concat(emb_sums, axis=1)
#else:
return fluid.layers.concat(emb_sums + [dense_tensor], axis=1)
def mlp(mlp_input):
fc1 = fluid.layers.fc(input=mlp_input,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(mlp_input.shape[1]))))
fc2 = fluid.layers.fc(input=fc1,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
pre = fluid.layers.fc(input=fc3,
size=2,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))))
return pre
emb_pair_sums = list(map(embedding_layer, sparse_inputs))
emb_sums = [x[1] for x in emb_pair_sums]
infer_vars = [x[0] for x in emb_pair_sums]
mlp_in = mlp_input_tensor(emb_sums, dense_input)
predict = mlp(mlp_in)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return predict, avg_cost, auc_var, batch_auc_var, infer_vars
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(
workdir="work_dir1",
port=9292,
device="cpu",
cube_conf="./cube/conf/cube.conf")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server_gpu import OpMaker
from paddle_serving_server_gpu import OpSeqMaker
from paddle_serving_server_gpu import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(
workdir="work_dir1",
port=9292,
device="cpu",
cube_conf="./cube/conf/cube.conf")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(
workdir="work_dir1",
port=9292,
device="cpu",
cube_conf="./cube/conf/cube.conf")
server.run_server()
# Encryption Model Prediction
([简体中文](README_CN.md)|English)
## Get Origin Model
The example uses the model file of the fit_a_line example as a origin model
```
sh get_data.sh
```
## Encrypt Model
```
python encrypt.py
```
The key is stored in the `key` file, and the encrypted model file and server-side configuration file are stored in the `encrypt_server` directory.
client-side configuration file are stored in the `encrypt_client` directory.
## Start Encryption Service
CPU Service
```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
```
GPU Service
```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
```
## Prediction
```
python test_client.py uci_housing_client/serving_client_conf.prototxt
```
# 加密模型预测
(简体中文|[English](README.md))
## 获取明文模型
示例中使用fit_a_line示例的模型文件作为明文模型
```
sh get_data.sh
```
## 模型加密
```
python encrypt.py
```
密钥保存在`key`文件中,加密模型文件以及server端配置文件保存在`encrypt_server`目录下,client端配置文件保存在`encrypt_client`目录下。
## 启动加密预测服务
CPU预测服务
```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
```
GPU预测服务
```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
```
## 预测
```
python test_client.py uci_housing_client/serving_client_conf.prototxt
```
......@@ -12,9 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import random
from paddle_serving_client.io import inference_model_to_serving
with open("key", "w") as f:
for i in range(1000000):
f.write("{}\n".format(random.randint(0, 999999)))
def serving_encryption():
inference_model_to_serving(
dirname="./uci_housing_model",
serving_server="encrypt_server",
serving_client="encrypt_client",
encryption=True)
if __name__ == "__main__":
serving_encryption()
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/data/ctr_prediction/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing_example/encrypt.tar.gz
tar -xzf encrypt.tar.gz
cp -rvf ../fit_a_line/uci_housing_model .
cp -rvf ../fit_a_line/uci_housing_client .
......@@ -13,19 +13,20 @@
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
from paddle_serving_client import Client
import sys
client = Client()
client.connect(["127.0.0.1:9393"])
client.load_client_config(sys.argv[1])
client.use_key("./key")
client.connect(["127.0.0.1:9300"], encryption=True)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": np.array(x)}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
import paddle
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=1)
for data in test_reader():
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
......@@ -14,12 +14,6 @@ sh get_data.sh
### 开启服务端
``` shell
python test_server.py uci_housing_model/
```
也可以通过下面的一行代码开启默认RPC服务:
```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
```
......
......@@ -16,7 +16,7 @@
import sys
import paddle
import paddle.fluid as fluid
paddle.enable_static()
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
......
......@@ -31,6 +31,6 @@ class UciService(WebService):
uci_service = UciService(name="uci")
uci_service.load_model_config("uci_housing_model")
uci_service.prepare_server(workdir="workdir", port=9292)
uci_service.prepare_server(workdir="workdir", port=9393)
uci_service.run_rpc_service()
uci_service.run_web_service()
......@@ -38,20 +38,9 @@ python test_asyn_client.py
python test_batch_client.py
```
### 通用 pb 预测
``` shell
python test_general_pb_client.py
```
### 预测超时
``` shell
python test_timeout_client.py
```
### List 输入
``` shell
python test_list_input_client.py
```
......@@ -18,7 +18,7 @@ import functools
import time
import threading
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
......@@ -43,7 +43,8 @@ x = [
]
task_count = 0
for i in range(3):
future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
new_data = np.array(x).astype("float32").reshape((1,13))
future = client.predict(feed={"x": new_data}, fetch=["price"], batch=False, asyn=True)
task_count += 1
future.add_done_callback(functools.partial(call_back))
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
......@@ -24,8 +24,11 @@ x = [
]
for i in range(3):
batch_feed = [{"x": x} for j in range(batch_size)]
fetch_map = client.predict(feed=batch_feed, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1, 1, 13))
batch_data = np.concatenate([new_data, new_data, new_data], axis=0)
print(batch_data.shape)
fetch_map = client.predict(feed={"x":batch_data}, fetch=["price"], batch=True)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
client = Client()
client.connect(["127.0.0.1:9393"])
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
......@@ -14,16 +14,27 @@
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
"""
for data in test_reader():
new_data = np.zeros((1, 1, 13)).astype("float32")
new_data[0] = data[0][0]
fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=True)
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
print(fetch_map)
"""
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1,13))
fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
......
......@@ -15,17 +15,18 @@
from paddle_serving_client import MultiLangClient as Client
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
client.set_rpc_timeout_ms(1)
client.set_rpc_timeout_ms(40)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1,13))
fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:
......
......@@ -27,7 +27,7 @@ preprocess = Sequential([
postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
client = Client()
client.connect(['127.0.0.1:9393'])
# client.set_rpc_timeout_ms(10000)
client.set_rpc_timeout_ms(15000)
im = preprocess(sys.argv[1])
fetch_map = client.predict(
......@@ -35,7 +35,8 @@ fetch_map = client.predict(
"image": im,
"im_size": np.array(list(im.shape[1:])),
},
fetch=["save_infer_model/scale_0.tmp_0"])
fetch=["save_infer_model/scale_0.tmp_0"], batch=False)
print(fetch_map)
fetch_map.pop("serving_status_code")
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
......@@ -3,6 +3,7 @@
worker_num: 1
#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port
rpc_port: 9998
http_port: 18082
dag:
......@@ -20,7 +21,7 @@ op:
model_config: uci_housing_model
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0" # "0,1"
devices: "" # "0,1"
#client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
client_type: local_predictor
......
......@@ -11,49 +11,50 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import Client
import sys
import os
import criteo as criteo
import time
from paddle_serving_client.metric import auc
try:
from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging
import numpy as np
from numpy import array
import sys
import base64
_LOGGER = logging.getLogger()
np.set_printoptions(threshold=sys.maxsize)
class UciOp(Op):
def init_op(self):
self.separator = ","
def preprocess(self, input_dicts, data_id, log_id):
"""
diff with web_server.py
javaclient input type is INDArray, restful request input is list.
this function simply reshape input to the Specified shape.
"""
(_, input_dict), = input_dicts.items()
_LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
log_id, input_dict))
proc_dict = {}
x_value = input_dict["x"]
input_dict["x"] = x_value.reshape(1,13)
return input_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id):
_LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
log_id, fetch_dict))
fetch_dict["price"] = str(fetch_dict["price"][0][0])
return fetch_dict, None, ""
class UciService(WebService):
def get_pipeline_response(self, read_op):
uci_op = UciOp(name="uci", input_ops=[read_op])
return uci_op
py_version = sys.version_info[0]
client = Client()
client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9292"])
batch = 1
buf_size = 100
dataset = criteo.CriteoDataset()
dataset.setup(1000001)
test_filelists = ["{}/part-0".format(sys.argv[2])]
reader = dataset.infer_reader(test_filelists, batch, buf_size)
label_list = []
prob_list = []
start = time.time()
for ei in range(10000):
if py_version == 2:
data = reader().next()
else:
data = reader().__next__()
feed_dict = {}
feed_dict['dense_input'] = np.array(data[0][0]).astype("float32").reshape(
1, 13)
feed_dict['dense_input.lod'] = [0, 1]
for i in range(1, 27):
tmp_data = np.array(data[0][i]).astype(np.int64)
feed_dict["embedding_{}.tmp_0".format(i - 1)] = tmp_data.reshape(
(1, len(data[0][i])))
feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, 1]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"], batch=True)
prob_list.append(fetch_map['prob'][0][1])
label_list.append(data[0][-1][0])
print(auc(label_list, prob_list))
end = time.time()
print(end - start)
uci_service = UciService(name="uci")
uci_service.prepare_pipeline_config("config.yml")
uci_service.run_service()
......@@ -57,6 +57,8 @@ class LocalPredictor(object):
mem_optim=True,
ir_optim=False,
use_trt=False,
use_lite=False,
use_xpu=False,
use_feed_fetch_ops=False):
"""
Load model config and set the engine config for the paddle predictor
......@@ -70,6 +72,8 @@ class LocalPredictor(object):
mem_optim: memory optimization, True default.
ir_optim: open calculation chart optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite Engint, False default
use_xpu: run predict on Baidu Kunlun, False default
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
......@@ -80,9 +84,9 @@ class LocalPredictor(object):
config = AnalysisConfig(model_path)
logger.info("load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_feed_fetch_ops:{}".format(
use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_feed_fetch_ops))
ir_optim, use_trt, use_lite, use_xpu, use_feed_fetch_ops))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
......@@ -119,6 +123,17 @@ class LocalPredictor(object):
use_static=False,
use_calib_mode=False)
if use_lite:
config.enable_lite_engine(
precision_mode = PrecisionType.Float32,
zero_copy = True,
passes_filter = [],
ops_filter = []
)
if use_xpu:
config.enable_xpu(100 * 1024 * 1024)
self.predictor = create_paddle_predictor(config)
def predict(self, feed=None, fetch=None, batch=False, log_id=0):
......
......@@ -19,6 +19,9 @@ from .proto import sdk_configure_pb2 as sdk
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
import numpy as np
import requests
import json
import base64
import time
import sys
......@@ -161,6 +164,7 @@ class Client(object):
self.fetch_names_to_idx_ = {}
self.lod_tensor_set = set()
self.feed_tensor_len = {}
self.key = None
for i, var in enumerate(model_conf.feed_var):
self.feed_names_to_idx_[var.alias_name] = i
......@@ -193,7 +197,28 @@ class Client(object):
else:
self.rpc_timeout_ms = rpc_timeout
def connect(self, endpoints=None):
def use_key(self, key_filename):
with open(key_filename, "r") as f:
self.key = f.read()
def get_serving_port(self, endpoints):
if self.key is not None:
req = json.dumps({"key": base64.b64encode(self.key)})
else:
req = json.dumps({})
r = requests.post("http://" + endpoints[0], req)
result = r.json()
print(result)
if "endpoint_list" not in result:
raise ValueError("server not ready")
else:
endpoints = [
endpoints[0].split(":")[0] + ":" +
str(result["endpoint_list"][0])
]
return endpoints
def connect(self, endpoints=None, encryption=False):
# check whether current endpoint is available
# init from client config
# create predictor here
......@@ -203,6 +228,8 @@ class Client(object):
"You must set the endpoints parameter or use add_variant function to create a variant."
)
else:
if encryption:
endpoints = self.get_serving_port(endpoints)
if self.predictor_sdk_ is None:
self.add_variant('default_tag_{}'.format(id(self)), endpoints,
100)
......@@ -522,20 +549,15 @@ class MultiLangClient(object):
req.fetch_var_names.extend(fetch)
req.is_python = is_python
req.log_id = log_id
feed_batch = None
if isinstance(feed, dict):
feed_batch = [feed]
elif isinstance(feed, list):
feed_batch = feed
else:
raise Exception("{} not support".format(type(feed)))
req.feed_var_names.extend(feed_batch[0].keys())
init_feed_names = False
for feed_data in feed_batch:
feed_var_names = []
for key in feed.keys():
if '.lod' not in key:
feed_var_names.append(key)
req.feed_var_names.extend(feed_var_names)
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed_data[name]
var = feed[name]
v_type = self.feed_types_[name]
if is_python:
data = None
......@@ -564,34 +586,9 @@ class MultiLangClient(object):
else:
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
else:
if isinstance(var, np.ndarray):
if v_type == 0: # int64
tensor.int64_data.extend(
var.reshape(-1).astype("int64").tolist())
elif v_type == 1:
tensor.float_data.extend(
var.reshape(-1).astype('float32').tolist())
elif v_type == 2:
tensor.int_data.extend(
var.reshape(-1).astype('int32').tolist())
else:
raise Exception("error tensor value type.")
elif isinstance(var, list):
if v_type == 0:
tensor.int64_data.extend(self._flatten_list(var))
elif v_type == 1:
tensor.float_data.extend(self._flatten_list(var))
elif v_type == 2:
tensor.int_data.extend(self._flatten_list(var))
else:
raise Exception("error tensor value type.")
else:
raise Exception("var must be list or ndarray.")
if isinstance(var, np.ndarray):
tensor.shape.extend(list(var.shape))
else:
tensor.shape.extend(self.feed_shapes_[name])
if "{}.lod".format(name) in feed.keys():
tensor.lod.extend(feed["{}.lod".format(name)])
inst.tensor_array.append(tensor)
req.insts.append(inst)
return req
......@@ -652,10 +649,17 @@ class MultiLangClient(object):
def predict(self,
feed,
fetch,
batch=True,
need_variant_tag=False,
asyn=False,
is_python=True,
log_id=0):
if isinstance(feed, dict) is False:
raise ValueError("Type Error. grpc feed must be dict.")
if batch is False:
for key in feed:
if ".lod" not in key:
feed[key] = feed[key][np.newaxis, :]
if not asyn:
try:
self.profile_.record('py_prepro_0')
......
......@@ -21,15 +21,105 @@ from paddle.fluid.framework import Program
from paddle.fluid import CPUPlace
from paddle.fluid.io import save_inference_model
import paddle.fluid as fluid
from paddle.fluid.core import CipherUtils
from paddle.fluid.core import CipherFactory
from paddle.fluid.core import Cipher
from ..proto import general_model_config_pb2 as model_conf
import os
import paddle
import paddle.nn.functional as F
import errno
from paddle.jit import to_static
def save_dygraph_model(serving_model_folder, client_config_folder, model):
paddle.jit.save(model, "serving_tmp")
loaded_layer = paddle.jit.load(path=".", model_filename="serving_tmp.pdmodel", params_filename="serving_tmp.pdiparams")
feed_target_names = [x.name for x in loaded_layer._input_spec()]
fetch_target_names = [x.name for x in loaded_layer._output_spec()]
inference_program = loaded_layer.program()
feed_var_dict = {
x: inference_program.global_block().var(x)
for x in feed_target_names
}
fetch_var_dict = {
x: inference_program.global_block().var(x)
for x in fetch_target_names
}
config = model_conf.GeneralModelConfig()
#int64 = 0; float32 = 1; int32 = 2;
for key in feed_var_dict:
feed_var = model_conf.FeedVar()
feed_var.alias_name = key
feed_var.name = feed_var_dict[key].name
feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT64:
feed_var.feed_type = 0
if feed_var_dict[key].dtype == core.VarDesc.VarType.FP32:
feed_var.feed_type = 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32:
feed_var.feed_type = 2
if feed_var.is_lod_tensor:
feed_var.shape.extend([-1])
else:
tmp_shape = []
for v in feed_var_dict[key].shape:
if v >= 0:
tmp_shape.append(v)
feed_var.shape.extend(tmp_shape)
config.feed_var.extend([feed_var])
for key in fetch_var_dict:
fetch_var = model_conf.FetchVar()
fetch_var.alias_name = key
fetch_var.name = fetch_var_dict[key].name
fetch_var.is_lod_tensor = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64:
fetch_var.fetch_type = 0
if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32:
fetch_var.fetch_type = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32:
fetch_var.fetch_type = 2
if fetch_var.is_lod_tensor:
fetch_var.shape.extend([-1])
else:
tmp_shape = []
for v in fetch_var_dict[key].shape:
if v >= 0:
tmp_shape.append(v)
fetch_var.shape.extend(tmp_shape)
config.fetch_var.extend([fetch_var])
cmd = "mkdir -p {}".format(client_config_folder)
os.system(cmd)
cmd = "mkdir -p {}".format(serving_model_folder)
os.system(cmd)
cmd = "mv {} {}/__model__".format("serving_tmp.pdmodel", serving_model_folder)
os.system(cmd)
cmd = "mv {} {}/__params__".format("serving_tmp.pdiparams", serving_model_folder)
os.system(cmd)
cmd = "rm -rf serving_tmp.pd*"
os.system(cmd)
with open("{}/serving_client_conf.prototxt".format(client_config_folder),
"w") as fout:
fout.write(str(config))
with open("{}/serving_server_conf.prototxt".format(serving_model_folder),
"w") as fout:
fout.write(str(config))
with open("{}/serving_client_conf.stream.prototxt".format(
client_config_folder), "wb") as fout:
fout.write(config.SerializeToString())
with open("{}/serving_server_conf.stream.prototxt".format(
serving_model_folder), "wb") as fout:
fout.write(config.SerializeToString())
def save_model(server_model_folder,
client_config_folder,
feed_var_dict,
fetch_var_dict,
main_program=None):
main_program=None,
encryption=False,
key_len=128,
encrypt_conf=None):
executor = Executor(place=CPUPlace())
feed_var_names = [feed_var_dict[x].name for x in feed_var_dict]
......@@ -39,12 +129,31 @@ def save_model(server_model_folder,
target_vars.append(fetch_var_dict[key])
target_var_names.append(key)
if not encryption:
save_inference_model(
server_model_folder,
feed_var_names,
target_vars,
executor,
model_filename="__model__",
params_filename="__params__",
main_program=main_program)
else:
if encrypt_conf == None:
aes_cipher = CipherFactory.create_cipher()
else:
#todo: more encryption algorithms
pass
key = CipherUtils.gen_key_to_file(128, "key")
params = fluid.io.save_persistables(
executor=executor, dirname=None, main_program=main_program)
model = main_program.desc.serialize_to_string()
if not os.path.exists(server_model_folder):
os.makedirs(server_model_folder)
os.chdir(server_model_folder)
aes_cipher.encrypt_to_file(params, key, "encrypt_params")
aes_cipher.encrypt_to_file(model, key, "encrypt_model")
os.chdir("..")
config = model_conf.GeneralModelConfig()
......@@ -116,7 +225,11 @@ def inference_model_to_serving(dirname,
serving_server="serving_server",
serving_client="serving_client",
model_filename=None,
params_filename=None):
params_filename=None,
encryption=False,
key_len=128,
encrypt_conf=None):
paddle.enable_static()
place = fluid.CPUPlace()
exe = fluid.Executor(place)
inference_program, feed_target_names, fetch_targets = \
......@@ -127,7 +240,7 @@ def inference_model_to_serving(dirname,
}
fetch_dict = {x.name: x for x in fetch_targets}
save_model(serving_server, serving_client, feed_dict, fetch_dict,
inference_program)
inference_program, encryption, key_len, encrypt_conf)
feed_names = feed_dict.keys()
fetch_names = fetch_dict.keys()
return feed_names, fetch_names
......@@ -157,6 +157,7 @@ class Server(object):
self.cur_path = os.getcwd()
self.use_local_bin = False
self.mkl_flag = False
self.encryption_model = False
self.product_name = None
self.container_id = None
self.model_config_paths = None # for multi-model in a workflow
......@@ -196,6 +197,8 @@ class Server(object):
def set_ir_optimize(self, flag=False):
self.ir_optimization = flag
def use_encryption_model(self, flag=False):
self.encryption_model = flag
def set_product_name(self, product_name=None):
if product_name == None:
......@@ -230,11 +233,21 @@ class Server(object):
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
if os.path.exists('{}/__params__'.format(model_config_path)):
suffix = ""
else:
suffix = "_DIR"
if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR"
if self.encryption_model:
engine.type = "FLUID_CPU_ANALYSIS_ENCRYPT"
else:
engine.type = "FLUID_CPU_ANALYSIS" + suffix
elif device == "gpu":
engine.type = "FLUID_GPU_ANALYSIS_DIR"
if self.encryption_model:
engine.type = "FLUID_GPU_ANALYSIS_ENCRYPT"
else:
engine.type = "FLUID_GPU_ANALYSIS" + suffix
self.model_toolkit_conf.engines.extend([engine])
......@@ -523,9 +536,8 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
feed_inst = request.insts[0]
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
......@@ -539,19 +551,11 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2: # int32
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
if len(var.lod) > 0:
feed_dict["{}.lod".format()] = var.lod
return feed_dict, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
......@@ -608,6 +612,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
ret = self.bclient_.predict(
feed=feed_dict,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
......
......@@ -18,8 +18,14 @@ Usage:
python -m paddle_serving_server.serve --model ./serving_server_model --port 9292
"""
import argparse
from .web_service import WebService
import sys
import json
import base64
import time
from multiprocessing import Process
from web_service import WebService, port_is_available
from flask import Flask, request
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
def parse_args(): # pylint: disable=doc-string-missing
......@@ -53,6 +59,11 @@ def parse_args(): # pylint: disable=doc-string-missing
type=int,
default=512 * 1024 * 1024,
help="Limit sizes of messages")
parser.add_argument(
"--use_encryption_model",
default=False,
action="store_true",
help="Use encryption model")
parser.add_argument(
"--use_multilang",
default=False,
......@@ -71,17 +82,18 @@ def parse_args(): # pylint: disable=doc-string-missing
return parser.parse_args()
def start_standard_model(): # pylint: disable=doc-string-missing
def start_standard_model(serving_port): # pylint: disable=doc-string-missing
args = parse_args()
thread_num = args.thread
model = args.model
port = args.port
port = serving_port
workdir = args.workdir
device = args.device
mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_mkl = args.use_mkl
use_encryption_model = args.use_encryption_model
use_multilang = args.use_multilang
if model == "":
......@@ -111,6 +123,7 @@ def start_standard_model(): # pylint: disable=doc-string-missing
server.use_mkl(use_mkl)
server.set_max_body_size(max_body_size)
server.set_port(port)
server.use_encryption_model(use_encryption_model)
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
......@@ -120,12 +133,88 @@ def start_standard_model(): # pylint: disable=doc-string-missing
server.prepare_server(workdir=workdir, port=port, device=device)
server.run_server()
class MainService(BaseHTTPRequestHandler):
def get_available_port(self):
default_port = 12000
for i in range(1000):
if port_is_available(default_port + i):
return default_port + i
def start_serving(self):
start_standard_model(serving_port)
def get_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"])
with open(args.model + "/key", "w") as f:
f.write(key)
return True
def check_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"])
with open(args.model + "/key", "r") as f:
cur_key = f.read()
return (key == cur_key)
def start(self, post_data):
post_data = json.loads(post_data)
global p_flag
if not p_flag:
if args.use_encryption_model:
print("waiting key for model")
if not self.get_key(post_data):
print("not found key in request")
return False
global serving_port
global p
serving_port = self.get_available_port()
p = Process(target=self.start_serving)
p.start()
time.sleep(3)
if p.is_alive():
p_flag = True
else:
return False
else:
if p.is_alive():
if not self.check_key(post_data):
return False
else:
return False
return True
def do_POST(self):
content_length = int(self.headers['Content-Length'])
post_data = self.rfile.read(content_length)
if self.start(post_data):
response = {"endpoint_list": [serving_port]}
else:
response = {"message": "start serving failed"}
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
self.wfile.write(json.dumps(response))
if __name__ == "__main__":
args = parse_args()
if args.name == "None":
start_standard_model()
if args.use_encryption_model:
p_flag = False
p = None
serving_port = 0
server = HTTPServer(('localhost', int(args.port)), MainService)
print(
'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
)
server.serve_forever()
else:
start_standard_model(args.port)
else:
service = WebService(name=args.name)
service.load_model_config(args.model)
......
......@@ -25,6 +25,16 @@ from paddle_serving_server import pipeline
from paddle_serving_server.pipeline import Op
def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
class WebService(object):
def __init__(self, name="default_service"):
self.name = name
......@@ -110,7 +120,7 @@ class WebService(object):
self.mem_optim = mem_optim
self.ir_optim = ir_optim
for i in range(1000):
if self.port_is_available(default_port + i):
if port_is_available(default_port + i):
self.port_list.append(default_port + i)
break
......
......@@ -70,6 +70,11 @@ def serve_args():
type=int,
default=512 * 1024 * 1024,
help="Limit sizes of messages")
parser.add_argument(
"--use_encryption_model",
default=False,
action="store_true",
help="Use encryption model")
parser.add_argument(
"--use_multilang",
default=False,
......@@ -77,6 +82,10 @@ def serve_args():
help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument(
"--product_name",
type=str,
......@@ -210,6 +219,8 @@ class Server(object):
self.use_local_bin = False
self.gpuid = 0
self.use_trt = False
self.use_lite = False
self.use_xpu = False
self.model_config_paths = None # for multi-model in a workflow
self.product_name = None
self.container_id = None
......@@ -279,7 +290,13 @@ class Server(object):
def set_trt(self):
self.use_trt = True
def _prepare_engine(self, model_config_paths, device):
def set_lite(self):
self.use_lite = True
def set_xpu(self):
self.use_xpu = True
def _prepare_engine(self, model_config_paths, device, use_encryption_model):
if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf()
......@@ -299,11 +316,23 @@ class Server(object):
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if device == "cpu":
if use_encryption_model:
engine.type = "FLUID_CPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_CPU_ANALYSIS_DIR"
elif device == "gpu":
if use_encryption_model:
engine.type = "FLUID_GPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_GPU_ANALYSIS_DIR"
elif device == "arm":
engine.type = "FLUID_ARM_ANALYSIS_DIR"
self.model_toolkit_conf.engines.extend([engine])
......@@ -405,10 +434,12 @@ class Server(object):
for line in version_file.readlines():
if re.match("cuda_version", line):
cuda_version = line.split("\"")[1]
if cuda_version != "trt":
device_version = "serving-gpu-cuda" + cuda_version + "-"
else:
if cuda_version == "trt":
device_version = "serving-gpu-" + cuda_version + "-"
elif cuda_version == "arm":
device_version = "serving-" + cuda_version + "-"
else:
device_version = "serving-gpu-cuda" + cuda_version + "-"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
......@@ -460,6 +491,7 @@ class Server(object):
workdir=None,
port=9292,
device="cpu",
use_encryption_model=False,
cube_conf=None):
if workdir == None:
workdir = "./tmp"
......@@ -473,7 +505,8 @@ class Server(object):
self.set_port(port)
self._prepare_resource(workdir, cube_conf)
self._prepare_engine(self.model_config_paths, device)
self._prepare_engine(self.model_config_paths, device,
use_encryption_model)
self._prepare_infer_service(port)
self.workdir = workdir
......@@ -507,7 +540,36 @@ class Server(object):
time.sleep(1)
else:
print("Use local bin : {}".format(self.bin_path))
self.check_cuda()
#self.check_cuda()
if self.use_lite:
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.max_body_size)
else:
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
......
......@@ -19,25 +19,29 @@ Usage:
"""
import argparse
import os
import json
import base64
from multiprocessing import Pool, Process
from paddle_serving_server_gpu import serve_args
from flask import Flask, request
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-missing
def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-string-missing
gpuid = int(gpuid)
device = "gpu"
port = args.port
if gpuid == -1:
device = "cpu"
elif gpuid >= 0:
port = args.port + index
port = port + index
thread_num = args.thread
model = args.model
mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_multilang = args.use_multilang
workdir = args.workdir
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "":
......@@ -67,20 +71,32 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
if args.use_trt:
server.set_trt()
if args.use_lite:
server.set_lite()
device = "arm"
if args.use_xpu:
server.set_xpu()
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
server.set_container_id(args.container_id)
server.load_model_config(model)
server.prepare_server(workdir=workdir, port=port, device=device)
server.prepare_server(
workdir=workdir,
port=port,
device=device,
use_encryption_model=args.use_encryption_model)
if gpuid >= 0:
server.set_gpuid(gpuid)
server.run_server()
def start_multi_card(args): # pylint: disable=doc-string-missing
def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing
gpus = ""
if serving_port == None:
serving_port = args.port
if args.gpu_ids == "":
gpus = []
else:
......@@ -95,16 +111,21 @@ def start_multi_card(args): # pylint: disable=doc-string-missing
exit(-1)
else:
env_gpus = []
if len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
if args.use_lite:
print("run arm server.")
start_gpu_card_model(-1, -1, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, serving_port, args)
else:
gpu_processes = []
for i, gpu_id in enumerate(gpus):
p = Process(
target=start_gpu_card_model, args=(
target=start_gpu_card_model,
args=(
i,
gpu_id,
serving_port,
args, ))
gpu_processes.append(p)
for p in gpu_processes:
......@@ -112,10 +133,88 @@ def start_multi_card(args): # pylint: disable=doc-string-missing
for p in gpu_processes:
p.join()
class MainService(BaseHTTPRequestHandler):
def get_available_port(self):
default_port = 12000
for i in range(1000):
if port_is_available(default_port + i):
return default_port + i
def start_serving(self):
start_multi_card(args, serving_port)
def get_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"])
with open(args.model + "/key", "w") as f:
f.write(key)
return True
def check_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"])
with open(args.model + "/key", "r") as f:
cur_key = f.read()
return (key == cur_key)
def start(self, post_data):
post_data = json.loads(post_data)
global p_flag
if not p_flag:
if args.use_encryption_model:
print("waiting key for model")
if not self.get_key(post_data):
print("not found key in request")
return False
global serving_port
global p
serving_port = self.get_available_port()
p = Process(target=self.start_serving)
p.start()
time.sleep(3)
if p.is_alive():
p_flag = True
else:
return False
else:
if p.is_alive():
if not self.check_key(post_data):
return False
else:
return False
return True
def do_POST(self):
content_length = int(self.headers['Content-Length'])
post_data = self.rfile.read(content_length)
if self.start(post_data):
response = {"endpoint_list": [serving_port]}
else:
response = {"message": "start serving failed"}
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
self.wfile.write(json.dumps(response))
if __name__ == "__main__":
args = serve_args()
if args.name == "None":
from .web_service import port_is_available
if args.use_encryption_model:
p_flag = False
p = None
serving_port = 0
server = HTTPServer(('localhost', int(args.port)), MainService)
print(
'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
)
server.serve_forever()
else:
start_multi_card(args)
else:
from .web_service import WebService
......@@ -128,7 +227,8 @@ if __name__ == "__main__":
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir, port=args.port, device=args.device)
workdir=args.workdir, port=args.port, device=args.device,
use_lite=args.use_lite, use_xpu=args.use_xpu, ir_optim=args.ir_optim)
web_service.run_rpc_service()
app_instance = Flask(__name__)
......
......@@ -28,6 +28,16 @@ from paddle_serving_server_gpu import pipeline
from paddle_serving_server_gpu.pipeline import Op
def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
class WebService(object):
def __init__(self, name="default_service"):
self.name = name
......@@ -83,9 +93,14 @@ class WebService(object):
gpuid=0,
thread_num=2,
mem_optim=True,
use_lite=False,
use_xpu=False,
ir_optim=False):
device = "gpu"
if gpuid == -1:
if use_lite:
device = "arm"
else:
device = "cpu"
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
......@@ -103,6 +118,11 @@ class WebService(object):
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
if use_lite:
server.set_lite()
if use_xpu:
server.set_xpu()
server.load_model_config(self.model_config)
if gpuid >= 0:
server.set_gpuid(gpuid)
......@@ -125,9 +145,11 @@ class WebService(object):
workdir="",
port=9393,
device="gpu",
use_lite=False,
use_xpu=False,
ir_optim=False,
gpuid=0,
mem_optim=True,
ir_optim=False):
mem_optim=True):
print("This API will be deprecated later. Please do not use it")
self.workdir = workdir
self.port = port
......@@ -136,7 +158,7 @@ class WebService(object):
self.port_list = []
default_port = 12000
for i in range(1000):
if self.port_is_available(default_port + i):
if port_is_available(default_port + i):
self.port_list.append(default_port + i)
if len(self.port_list) > len(self.gpus):
break
......@@ -150,6 +172,8 @@ class WebService(object):
-1,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
else:
for i, gpuid in enumerate(self.gpus):
......@@ -160,6 +184,8 @@ class WebService(object):
gpuid,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
def _launch_web_service(self):
......
......@@ -44,6 +44,8 @@ class LocalServiceHandler(object):
ir_optim=False,
available_port_generator=None,
use_trt=False,
use_lite=False,
use_xpu=False,
use_profile=False):
"""
Initialization of localservicehandler
......@@ -60,6 +62,8 @@ class LocalServiceHandler(object):
ir_optim: use calculation chart optimization, False default.
available_port_generator: generate available ports
use_trt: use nvidia tensorRt engine, False default.
use_lite: use Paddle-Lite engine, False default.
use_xpu: run predict on Baidu Kunlun, False default.
use_profile: use profiling, False default.
Returns:
......@@ -74,6 +78,12 @@ class LocalServiceHandler(object):
if devices == "":
# cpu
devices = [-1]
if use_lite:
self._device_type = "arm"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in arm device. Port({})"
.format(model_config, self._port_list))
else:
self._device_type = "cpu"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in cpu device. Port({})"
......@@ -96,6 +106,8 @@ class LocalServiceHandler(object):
self._rpc_service_list = []
self._server_pros = []
self._use_trt = use_trt
self._use_lite = use_lite
self._use_xpu = use_xpu
self._use_profile = use_profile
self.fetch_names_ = fetch_names
......@@ -138,8 +150,11 @@ class LocalServiceHandler(object):
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()
use_gpu = False
use_lite = False
if self._device_type == "gpu":
use_gpu = True
elif self._device_type == "arm":
use_lite = True
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=use_gpu,
......@@ -148,7 +163,9 @@ class LocalServiceHandler(object):
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim,
use_trt=self._use_trt)
use_trt=self._use_trt,
use_lite=use_lite,
use_xpu=self._use_xpu)
return self._local_predictor_client
def get_client_config(self):
......@@ -185,7 +202,7 @@ class LocalServiceHandler(object):
server = Server()
else:
#gpu
#gpu or arm
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......
......@@ -21,6 +21,7 @@ import contextlib
from contextlib import closing
import multiprocessing
import yaml
import io
from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2
from . import operator
......@@ -333,7 +334,7 @@ class ServerYamlConfChecker(object):
raise SystemExit("Failed to prepare_server: only one of yml_file"
" or yml_dict can be selected as the parameter.")
if yml_file is not None:
with open(yml_file) as f:
with io.open(yml_file, encoding='utf-8') as f:
conf = yaml.load(f.read())
elif yml_dict is not None:
conf = yml_dict
......
......@@ -32,7 +32,7 @@ if '${PACK}' == 'ON':
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'sentencepiece<=0.1.92', 'opencv-python<=4.2.0.32', 'pillow',
'six >= 1.10.0', 'sentencepiece', 'opencv-python', 'pillow',
'pyclipper'
]
......
......@@ -29,7 +29,7 @@ util.gen_pipeline_code("paddle_serving_server")
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app', 'func_timeout', 'pyyaml'
'flask >= 1.1.1', 'func_timeout', 'pyyaml'
]
packages=['paddle_serving_server',
......
......@@ -31,7 +31,7 @@ util.gen_pipeline_code("paddle_serving_server_gpu")
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app', 'func_timeout', 'pyyaml'
'flask >= 1.1.1', 'func_timeout', 'pyyaml'
]
packages=['paddle_serving_server_gpu',
......
......@@ -39,6 +39,8 @@ RUN yum -y install wget && \
make clean && \
echo 'export PATH=/usr/local/python3.6/bin:$PATH' >> /root/.bashrc && \
echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
pip install requests && \
pip3 install requests && \
source /root/.bashrc && \
cd .. && rm -rf Python-3.6.8* && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
......
......@@ -49,6 +49,8 @@ RUN yum -y install wget && \
cd .. && rm -rf protobuf-* && \
yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \
yum clean all && \
pip install requests && \
pip3 install requests && \
localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \
echo "export LANG=en_US.utf8" >> /root/.bashrc && \
echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc
......@@ -23,7 +23,8 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
RUN yum -y install python-devel sqlite-devel >/dev/null \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& rm get-pip.py
&& rm get-pip.py \
&& pip install requests
RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 \
&& yum -y install bzip2 >/dev/null \
......@@ -34,6 +35,9 @@ RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2
&& cd .. \
&& rm -rf patchelf-0.10*
RUN yum install -y python3 python3-devel \
&& pip3 install requests
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
......@@ -41,8 +45,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/p
make clean && \
cd .. && rm -rf protobuf-*
RUN yum install -y python3 python3-devel
RUN yum -y update >/dev/null \
&& yum -y install dnf >/dev/null \
&& yum -y install dnf-plugins-core >/dev/null \
......
......@@ -30,11 +30,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
RUN yum -y install python-devel sqlite-devel \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& rm get-pip.py
&& rm get-pip.py \
&& pip install requests
RUN yum install -y python3 python3-devel \
&& yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
&& yum clean all
&& yum clean all \
&& pip3 install requests
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc \
......
......@@ -29,11 +29,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
RUN yum -y install python-devel sqlite-devel \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& rm get-pip.py
&& rm get-pip.py \
&& pip install requests
RUN yum install -y python3 python3-devel \
&& yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
&& yum clean all
&& yum clean all \
&& pip3 install requests
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc \
......
......@@ -19,11 +19,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
RUN yum -y install python-devel sqlite-devel \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& rm get-pip.py
&& rm get-pip.py \
&& pip install requests
RUN yum install -y python3 python3-devel \
&& yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
&& yum clean all
&& yum clean all \
&& pip3 install requests
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc \
......
......@@ -174,7 +174,7 @@ function python_test_fit_a_line() {
# test web
unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --name uci --port 9393 --thread 4 --name uci > /dev/null &"
check_cmd "python test_server.py > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
......@@ -183,14 +183,6 @@ function python_test_fit_a_line() {
echo "HTTP status code -ne 200"
exit 1
fi
# test web batch
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
if [ ${http_code} -ne 200 ]; then
echo "HTTP status code -ne 200"
exit 1
fi
setproxy # recover proxy state
kill_server_process
;;
......@@ -202,27 +194,6 @@ function python_test_fit_a_line() {
check_cmd "python test_client.py uci_housing_client/serving_client_conf.prototxt > /dev/null"
kill_server_process
# test web
#unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
#check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
#sleep 5 # wait for the server to start
#check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
#http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
#if [ ${http_code} -ne 200 ]; then
# echo "HTTP status code -ne 200"
# exit 1
#fi
# test web batch
#check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
#http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
#if [ ${http_code} -ne 200 ]; then
# echo "HTTP status code -ne 200"
# exit 1
#fi
#setproxy # recover proxy state
#kill_server_process
;;
*)
echo "error type"
......@@ -514,6 +485,42 @@ function python_test_lac() {
cd ..
}
function python_test_encryption(){
#pwd: /Serving/python/examples
cd encryption
sh get_data.sh
local TYPE=$1
export SERVING_BIN=${SERIVNG_WORKDIR}/build-server-${TYPE}/core/general-server/serving
case $TYPE in
CPU)
#check_cmd "python encrypt.py"
#sleep 5
check_cmd "python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model > /dev/null &"
sleep 5
check_cmd "python test_client.py encrypt_client/serving_client_conf.prototxt"
kill_server_process
;;
GPU)
#check_cmd "python encrypt.py"
#sleep 5
check_cmd "python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0"
sleep 5
check_cmd "python test_client.py encrypt_client/serving_client_conf.prototxt"
kill_servere_process
;;
*)
echo "error type"
exit 1
;;
esac
echo "encryption $TYPE test finished as expected"
setproxy
unset SERVING_BIN
cd ..
}
function java_run_test() {
# pwd: /Serving
local TYPE=$1
......@@ -589,9 +596,6 @@ function python_test_grpc_impl() {
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -600,9 +604,6 @@ function python_test_grpc_impl() {
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -651,9 +652,7 @@ COMMENT
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
#check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -662,9 +661,7 @@ COMMENT
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
#check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -960,6 +957,7 @@ function python_run_test() {
python_test_lac $TYPE # pwd: /Serving/python/examples
python_test_multi_process $TYPE # pwd: /Serving/python/examples
python_test_multi_fetch $TYPE # pwd: /Serving/python/examples
python_test_encryption $TYPE # pwd: /Serving/python/examples
python_test_yolov4 $TYPE # pwd: /Serving/python/examples
python_test_grpc_impl $TYPE # pwd: /Serving/python/examples
python_test_resnet50 $TYPE # pwd: /Serving/python/examples
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册