Unverified commit 6b1c492d authored by: BohaoWu committed by: GitHub

Merge branch 'develop' into Zelda

......@@ -49,6 +49,9 @@ set(THIRD_PARTY_BUILD_TYPE Release)
option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" OFF)
option(WITH_MKL "Compile Paddle Serving with MKL support." OFF)
option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" OFF)
option(WITH_LITE "Compile Paddle Serving with Paddle Lite Engine" OFF)
option(WITH_XPU "Compile Paddle Serving with Baidu Kunlun" OFF)
option(WITH_PYTHON "Compile Paddle Serving with Python" ON)
option(CLIENT "Compile Paddle Serving Client" OFF)
option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
......@@ -66,40 +69,40 @@ if (NOT DEFINED WITH_MKLDNN)
endif()
endif()
if (SERVER)
include(external/jsoncpp)
#include(external/rocksdb)
endif()
if (SERVER OR CLIENT)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/brpc)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
include(flags)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/brpc)
include(external/gflags)
include(external/glog)
if (WITH_PYTHON)
include(external/pybind11)
include(external/python)
endif()
include(generic)
include(flags)
endif()
if (APP)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
endif()
if (SERVER)
include(external/cudnn)
include(paddlepaddle)
include(external/jsoncpp)
#include(external/rocksdb)
include(external/cudnn)
include(paddlepaddle)
endif()
message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
......@@ -125,26 +128,24 @@ set(EXTERNAL_LIBS
)
if(SERVER)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
endif()
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(SERVER)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
if (SERVER)
list(APPEND EXTERNAL_LIBS paddlepaddle)
endif()
add_subdirectory(core)
if(SERVER)
add_subdirectory(paddle_inference)
add_subdirectory(paddle_inference)
endif()
add_subdirectory(python)
if (WITH_PYTHON)
add_subdirectory(python)
endif()
......@@ -47,9 +47,10 @@ nvidia-docker exec -it test bash
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-app==0.2.0
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source; add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to the pip command) to speed up the download.
......
......@@ -49,9 +49,10 @@ nvidia-docker exec -it test bash
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-app==0.2.0
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
```
You may need to use a domestic mirror source (for example, the Tsinghua mirror; add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to the pip command) to speed up the download.
......
......@@ -22,8 +22,9 @@ set(BOOST_PROJECT "extern_boost")
# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_VER "1.74.0")
set(BOOST_TAR "boost_1_74_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
......@@ -13,6 +13,9 @@
# limitations under the License.
INCLUDE(ExternalProject)
set(BRPC_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-narrowing")
set(BRPC_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(BRPC_CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-narrowing")
find_package(OpenSSL REQUIRED)
......@@ -35,19 +38,28 @@ INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
if(WITH_LITE)
set(BRPC_REPO "https://github.com/zhangjun/incubator-brpc.git")
set(BRPC_TAG "master")
else()
set(BRPC_REPO "https://github.com/wangjiawei04/brpc")
set(BRPC_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47")
endif()
# If a minimal .a is needed, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(gongwb): change to the newest repo when it is updated.
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
GIT_REPOSITORY ${BRPC_REPO}
GIT_TAG ${BRPC_TAG}
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${BRPC_CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${BRPC_CMAKE_C_FLAGS}
-DCMAKE_CPP_FLAGS=${BRPC_CMAKE_CPP_FLAGS}
-DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
......
......@@ -93,7 +93,11 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
if(WITH_LITE OR WITH_XPU)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -fopenmp -pthread -ldl -lrt")
else()
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif()
endif(NOT APPLE)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
......
......@@ -31,14 +31,20 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "1.8.4")
SET(PADDLE_VERSION "2.0.0-rc1")
if (WITH_GPU)
if (WITH_TRT)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7-avx-mkl-trt6")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
endif()
elseif (WITH_LITE)
if (WITH_XPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
endif()
else()
if (WITH_AVX)
if (WITH_MKLML)
......@@ -51,7 +57,12 @@ else()
endif()
endif()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
if(WITH_LITE)
SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
else()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
if (WITH_GPU OR WITH_MKLML)
if (WITH_TRT)
......@@ -117,11 +128,24 @@ ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
endif()
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
if (WITH_LITE)
ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
if (WITH_XPU)
ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpuapi PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpuapi.so)
ADD_LIBRARY(xpurt SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpurt PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpurt.so)
endif()
endif()
ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
......@@ -132,7 +156,14 @@ LIST(APPEND external_project_dependencies paddle)
LIST(APPEND paddle_depend_libs
xxhash)
if(WITH_LITE)
LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
if(WITH_XPU)
LIST(APPEND paddle_depend_libs xpuapi xpurt)
endif()
endif()
if(WITH_TRT)
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
endif()
......@@ -14,10 +14,6 @@ list(APPEND configure_srcs ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
add_library(configure ${configure_srcs})
add_dependencies(configure brpc)
add_executable(test_configure
${CMAKE_CURRENT_LIST_DIR}/tests/test_configure.cpp)
target_link_libraries(test_configure configure protobuf)
install(TARGETS configure
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
......@@ -31,6 +27,8 @@ install(FILES ${inc}
DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
endif()
if (WITH_PYTHON)
py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
......@@ -45,19 +43,19 @@ add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E to
add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
add_custom_command(TARGET sdk_configure_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......@@ -65,7 +63,7 @@ endif()
if (APP)
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......@@ -74,29 +72,29 @@ if (SERVER)
py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(server_config_py_proto server_config_py_proto_init)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(TARGET server_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else()
add_custom_command(TARGET server_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py
COMMAND cp -f *.py
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated python proto into directory
paddle_serving_server_gpu/proto."
......@@ -105,7 +103,7 @@ add_custom_command(TARGET server_config_py_proto POST_BUILD
add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py
COMMAND cp -f *.py
${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated general_model_config proto file into directory
paddle_serving_server_gpu/proto."
......@@ -113,8 +111,10 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server_gpu/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
endif()
......@@ -45,6 +45,8 @@ message EngineDesc {
optional bool force_update_static_cache = 15;
optional bool enable_ir_optimization = 16;
optional bool use_trt = 17;
optional bool use_lite = 18;
optional bool use_xpu = 19;
};
// model_toolkit conf
......
......@@ -6,6 +6,11 @@ add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-
if (WITH_GPU)
add_dependencies(serving fluid_gpu_engine)
endif()
if (WITH_LITE)
add_dependencies(serving fluid_arm_engine)
endif()
target_include_directories(serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
......@@ -15,6 +20,11 @@ if(WITH_GPU)
-Wl,--no-whole-archive)
endif()
if(WITH_LITE)
target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
-Wl,--no-whole-archive)
endif()
target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
-Wl,--no-whole-archive)
......
......@@ -38,145 +38,7 @@ using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralDistKVInferOp::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
std::vector<uint64_t> keys;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0;
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
++dense_count;
continue;
}
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
}
key_len += elem_num;
int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
Timer timeline;
int64_t cube_start = timeline.TimeStampUS();
timeline.Start();
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "(logid=" << log_id
<< ") cube init error or cube config not given.";
return -1;
}
int ret = cube->seek(table_names[0], keys, &values);
int64_t cube_end = timeline.TimeStampUS();
if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "(logid=" << log_id << ") cube value return null";
}
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
dense_out.resize(dense_count);
int cube_val_idx = 0;
int sparse_idx = 0;
int dense_idx = 0;
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
++dense_idx;
continue;
}
sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
std::copy(in->at(i).lod[x].begin(),
in->at(i).lod[x].end(),
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
cube_val_idx++;
}
++sparse_idx;
}
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
output_blob->SetBatchSize(batch_size);
output_blob->SetLogId(log_id);
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
int64_t start = timeline.TimeStampUS();
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, cube_start);
AddBlobInfo(output_blob, cube_end);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
int GeneralDistKVInferOp::inference() { return 0; }
DEFINE_OP(GeneralDistKVInferOp);
} // namespace serving
......
......@@ -188,21 +188,6 @@ int GeneralDistKVQuantInferOp::inference() {
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralDistKVQuantInferOp);
......
......@@ -44,45 +44,9 @@ int GeneralInferOp::inference() {
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
output_blob->SetLogId(log_id);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
if (InferManager::instance().infer(engine_name().c_str())) {
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->_batch_size;
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
output_blob->_batch_size = batch_size;
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
engine_name().c_str(), in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name().c_str();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralInferOp);
......
......@@ -20,6 +20,7 @@
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
namespace baidu {
......@@ -32,6 +33,7 @@ using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::InferManager;
int conf_check(const Request *req,
const std::shared_ptr<PaddleGeneralModelConfig> &model_config) {
......@@ -71,75 +73,34 @@ int conf_check(const Request *req,
int GeneralReaderOp::inference() {
// read request from client
// TODO: only support one engine here
std::string engine_name = "general_infer_0";
const Request *req = dynamic_cast<const Request *>(get_request_message());
uint64_t log_id = req->log_id();
int input_var_num = 0;
std::vector<int64_t> elem_type;
std::vector<int64_t> elem_size;
std::vector<int64_t> capacity;
GeneralBlob *res = mutable_data<GeneralBlob>();
TensorVector *out = &res->tensor_vector;
res->SetLogId(log_id);
if (!res) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed get op tls reader object output";
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
int var_num = req->insts(0).tensor_array_size();
VLOG(2) << "(logid=" << log_id << ") var num: " << var_num;
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
VLOG(2) << "(logid=" << log_id << ") print general model config done.";
// TODO(guru4elephant): how to do conditional check?
/*
int ret = conf_check(req, model_config);
if (ret != 0) {
LOG(ERROR) << "model conf of server:";
resource.print_general_model_config(model_config);
return 0;
}
*/
// package tensor
elem_type.resize(var_num);
elem_size.resize(var_num);
capacity.resize(var_num);
// prepare basic information for input
for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor lod_tensor;
elem_type[i] = req->insts(0).tensor_array(i).elem_type();
VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
if (elem_type[i] == 0) { // int64
elem_size[i] = sizeof(int64_t);
lod_tensor.dtype = paddle::PaddleDType::INT64;
} else if (elem_type[i] == 1) {
elem_size[i] = sizeof(float);
lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
} else if (elem_type[i] == 2) {
elem_size[i] = sizeof(int32_t);
lod_tensor.dtype = paddle::PaddleDType::INT32;
}
// implement lod tensor here
std::string tensor_name = model_config->_feed_name[i];
VLOG(2) << "(logid=" << log_id << ") get tensor name: " << tensor_name;
auto lod_tensor = InferManager::instance().GetInputHandle(
engine_name.c_str(), tensor_name.c_str());
std::vector<std::vector<size_t>> lod;
std::vector<int> shape;
// get lod info here
if (req->insts(0).tensor_array(i).lod_size() > 0) {
VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
lod_tensor.lod.resize(1);
lod.resize(1);
for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
lod_tensor.lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
}
capacity[i] = 1;
for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
......@@ -147,7 +108,7 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
......@@ -158,92 +119,41 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
}
lod_tensor.name = model_config->_feed_name[i];
out->push_back(lod_tensor);
}
// specify the memory needed for output tensor_vector
for (int i = 0; i < var_num; ++i) {
if (out->at(i).lod.size() == 1) {
int tensor_size = 0;
const Tensor &tensor = req->insts(0).tensor_array(i);
int data_len = 0;
if (tensor.int64_data_size() > 0) {
data_len = tensor.int64_data_size();
} else if (tensor.float_data_size() > 0) {
data_len = tensor.float_data_size();
} else if (tensor.int_data_size() > 0) {
data_len = tensor.int_data_size();
}
VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
<< "]: " << data_len;
tensor_size += data_len;
int cur_len = out->at(i).lod[0].back();
VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len;
int sample_len = 0;
if (tensor.shape_size() == 1) {
sample_len = data_len;
} else {
sample_len = tensor.shape(0);
}
VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
out->at(i).data.Resize(tensor_size * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is lod_tensor and len=" << out->at(i).lod[0].back();
} else {
out->at(i).data.Resize(capacity[i] * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor and capacity=" << capacity[i];
}
}
// fill the data into output general_blob
for (int i = 0; i < var_num; ++i) {
if (elem_type[i] == 0) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).int64_data(0);
int offset = 0;
lod_tensor->SetLoD(lod);
lod_tensor->Reshape(shape);
// insert data here
if (req->insts(0).tensor_array(i).elem_type() == 0) {
// TODO: Copy twice here, can optimize
int elem_num = req->insts(0).tensor_array(i).int64_data_size();
std::vector<int64_t> data(elem_num);
int64_t *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).int64_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).int64_data(k);
}
} else if (elem_type[i] == 1) {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).float_data(0);
int offset = 0;
lod_tensor->CopyFromCpu(dst_ptr);
} else if (req->insts(0).tensor_array(i).elem_type() == 1) {
int elem_num = req->insts(0).tensor_array(i).float_data_size();
std::vector<float> data(elem_num);
float *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).float_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).float_data(k);
}
} else if (elem_type[i] == 2) {
int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).int_data(0);
int offset = 0;
lod_tensor->CopyFromCpu(dst_ptr);
} else if (req->insts(0).tensor_array(i).elem_type() == 2) {
int elem_num = req->insts(0).tensor_array(i).int_data_size();
std::vector<int32_t> data(elem_num);
int32_t *dst_ptr = data.data();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(0).tensor_array(i).int_data(k);
dst_ptr[k] = req->insts(0).tensor_array(i).int_data(k);
}
lod_tensor->CopyFromCpu(dst_ptr);
}
}
VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
timeline.Pause();
int64_t end = timeline.TimeStampUS();
res->p_size = 0;
res->_batch_size = 1;
AddBlobInfo(res, start);
AddBlobInfo(res, end);
VLOG(2) << "(logid=" << log_id << ") read data from client success";
return 0;
}
DEFINE_OP(GeneralReaderOp);
......
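The refactored reader op above no longer fills a `TensorVector`; request data now goes straight into the engine through `paddle_infer::Tensor` handles. The following is a minimal sketch (not part of this diff) of that feed pattern, assuming an engine named `general_infer_0` has already been loaded by the framework; the helper name `feed_int64_input` is illustrative.

```cpp
// Sketch: feed one INT64 request tensor through the new handle-based API.
// Assumes the serving framework has already loaded an engine named
// "general_infer_0" (the name hard-coded in GeneralReaderOp above).
#include <cstdint>
#include <string>
#include <vector>

#include "core/predictor/framework/infer.h"

using baidu::paddle_serving::predictor::InferManager;

int feed_int64_input(const std::string& tensor_name,
                     const std::vector<int64_t>& data,
                     const std::vector<int>& shape,
                     const std::vector<std::vector<size_t>>& lod) {
  auto handle =
      InferManager::instance().GetInputHandle("general_infer_0", tensor_name);
  if (!handle) {
    return -1;  // engine or input tensor not found
  }
  if (!lod.empty()) {
    handle->SetLoD(lod);  // lod info taken from the request, if any
  }
  handle->Reshape(shape);            // shape taken from the request
  handle->CopyFromCpu(data.data());  // copy the request buffer into the engine
  return 0;
}
```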
......@@ -40,160 +40,60 @@ using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralResponseOp::inference() {
const std::vector<std::string> pre_node_names = pre_names();
VLOG(2) << "pre node names size: " << pre_node_names.size();
const GeneralBlob *input_blob;
uint64_t log_id =
get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
const Request *req = dynamic_cast<const Request *>(get_request_message());
// response inst with only fetch_var_names
Response *res = mutable_data<Response>();
Timer timeline;
// double response_time = 0.0;
// timeline.Start();
int64_t start = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
VLOG(2) << "(logid=" << log_id
<< ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
std::vector<int> fetch_index;
fetch_index.resize(req->fetch_var_names_size());
std::vector<int> capacity(req->fetch_var_names_size(), 1);
std::string engine_name = "general_infer_0";
ModelOutput *output = res->add_outputs();
FetchInst *fetch_inst = output->add_insts();
FetchInst *fetch_p = output->mutable_insts(0);
std::vector<std::string> outs =
InferManager::instance().GetOutputNames(engine_name.c_str());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
}
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
const std::string &pre_name = pre_node_names[pi];
VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
<< " (" << pre_node_names.size() << ")";
input_blob = get_depend_argument<GeneralBlob>(pre_name);
// fprintf(stderr, "input(%s) blob address %x\n", pre_names.c_str(),
// input_blob);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op: " << pre_name;
return -1;
Tensor *tensor = fetch_inst->add_tensor_array();
std::string tensor_name = outs[i];
auto lod_tensor = InferManager::instance().GetOutputHandle(
engine_name.c_str(), tensor_name.c_str());
std::vector<int> shape = lod_tensor->shape();
for (int k = 0; k < shape.size(); ++k) {
capacity[i] *= shape[k];
tensor->add_shape(shape[k]);
}
const TensorVector *in = &input_blob->tensor_vector;
ModelOutput *output = res->add_outputs();
// To get the order of model return values
output->set_engine_name(pre_name);
FetchInst *fetch_inst = output->add_insts();
for (auto &idx : fetch_index) {
Tensor *tensor = fetch_inst->add_tensor_array();
if (model_config->_is_lod_fetch[idx]) {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
<< model_config->_fetch_name[idx] << " is lod_tensor";
for (int k = 0; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
}
} else {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
<< model_config->_fetch_name[idx] << " is tensor";
for (int k = 0; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
}
}
auto dtype = lod_tensor->type();
if (dtype == paddle::PaddleDType::INT64) {
std::vector<int64_t> datas(capacity[i]);
int64_t *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
data_ptr + capacity[i]);
tensor->mutable_int64_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::FLOAT32) {
std::vector<float> datas(capacity[i]);
float *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<float> tmp_data(data_ptr,
data_ptr + capacity[i]);
tensor->mutable_float_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::INT32) {
std::vector<int32_t> datas(capacity[i]);
int32_t *data_ptr = datas.data();
lod_tensor->CopyToCpu(data_ptr);
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + capacity[i]);
tensor->mutable_int_data()->Swap(&tmp_data);
}
int var_idx = 0;
for (auto &idx : fetch_index) {
int cap = 1;
for (int j = 0; j < in->at(idx).shape.size(); ++j) {
cap *= in->at(idx).shape[j];
std::vector<std::vector<size_t>> lod = lod_tensor->lod();
if (lod.size() > 0) {
for (int j = 0; j < lod[0].size(); ++j) {
tensor->add_lod(lod[0][j]);
}
FetchInst *fetch_p = output->mutable_insts(0);
auto dtype = in->at(idx).dtype;
if (dtype == paddle::PaddleDType::INT64) {
VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
<< model_config->_fetch_name[idx] << "].";
int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
// from
// https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy
// `Swap` method is faster than `{}` method.
google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
&tmp_data);
} else if (dtype == paddle::PaddleDType::FLOAT32) {
VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
<< model_config->_fetch_name[idx] << "].";
float *data_ptr = static_cast<float *>(in->at(idx).data.data());
google::protobuf::RepeatedField<float> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
&tmp_data);
} else if (dtype == paddle::PaddleDType::INT32) {
VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
<< model_config->_fetch_name[idx] << "].";
int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
&tmp_data);
}
if (model_config->_is_lod_fetch[idx]) {
if (in->at(idx).lod.size() > 0) {
for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_lod(
in->at(idx).lod[0][j]);
}
}
}
VLOG(2) << "(logid=" << log_id << ") fetch var ["
<< model_config->_fetch_name[idx] << "] ready";
var_idx++;
}
}
if (req->profile_server()) {
int64_t end = timeline.TimeStampUS();
// TODO(barriery): multi-model profile_time.
// At present, only the response_op is multi-input, so here we get
// the profile_time by hard coding. It needs to be replaced with
// a more elegant way.
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
input_blob = get_depend_argument<GeneralBlob>(pre_node_names[pi]);
VLOG(2) << "(logid=" << log_id
<< ") p size for input blob: " << input_blob->p_size;
int profile_time_idx = -1;
if (pi == 0) {
profile_time_idx = 0;
} else {
profile_time_idx = input_blob->p_size - 2;
}
for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
}
}
// TODO(guru4elephant): find more elegant way to do this
res->add_profile_time(start);
res->add_profile_time(end);
}
return 0;
}
......
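Symmetrically, the response op now reads results back through output handles instead of walking the predecessor blob's `TensorVector`. A minimal sketch of that fetch pattern (illustrative helper, same assumed engine name as above):

```cpp
// Sketch: copy one FLOAT32 output tensor back to host memory through the
// new handle-based API. Assumes infer() has already been run on the
// "general_infer_0" engine.
#include <string>
#include <vector>

#include "core/predictor/framework/infer.h"

using baidu::paddle_serving::predictor::InferManager;

std::vector<float> fetch_float_output(const std::string& tensor_name) {
  auto handle =
      InferManager::instance().GetOutputHandle("general_infer_0", tensor_name);
  std::vector<int> shape = handle->shape();
  int capacity = 1;
  for (int dim : shape) {
    capacity *= dim;  // total element count across all dimensions
  }
  std::vector<float> result(capacity);
  handle->CopyToCpu(result.data());  // copy engine output into the host buffer
  return result;
}
```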
......@@ -7,6 +7,7 @@ PROTOBUF_GENERATE_CPP(pdcodegen_proto_srcs pdcodegen_proto_hdrs
LIST(APPEND pdcodegen_srcs ${pdcodegen_proto_srcs})
add_executable(pdcodegen ${pdcodegen_srcs})
add_dependencies(pdcodegen boost)
target_link_libraries(pdcodegen protobuf ${PROTOBUF_PROTOC_LIBRARY})
# install
......
......@@ -12,13 +12,12 @@ set_source_files_properties(
${pdserving_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_fluid)
if (WITH_TRT)
add_definitions(-DWITH_TRT)
endif()
target_link_libraries(pdserving
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_fluid ${paddle_depend_libs})
# install
install(TARGETS pdserving
RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin
......
......@@ -20,10 +20,9 @@
#include <utility>
#include <vector>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/bsf.h"
#include "core/predictor/framework/factory.h"
#include "core/predictor/framework/infer_data.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace predictor {
......@@ -39,6 +38,8 @@ class InferEngineCreationParams {
_static_optimization = false;
_force_update_static_cache = false;
_use_trt = false;
_use_lite = false;
_use_xpu = false;
}
void set_path(const std::string& path) { _path = path; }
......@@ -53,6 +54,10 @@ class InferEngineCreationParams {
void set_use_trt(bool use_trt) { _use_trt = use_trt; }
void set_use_lite(bool use_lite) { _use_lite = use_lite; }
void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
bool enable_memory_optimization() const {
return _enable_memory_optimization;
}
......@@ -61,6 +66,10 @@ class InferEngineCreationParams {
bool use_trt() const { return _use_trt; }
bool use_lite() const { return _use_lite; }
bool use_xpu() const { return _use_xpu; }
void set_static_optimization(bool static_optimization = false) {
_static_optimization = static_optimization;
}
......@@ -80,6 +89,9 @@ class InferEngineCreationParams {
<< "model_path = " << _path << ", "
<< "enable_memory_optimization = " << _enable_memory_optimization
<< ", "
<< "enable_tensorrt = " << _use_trt << ", "
<< "enable_lite = " << _use_lite << ", "
<< "enable_xpu = " << _use_xpu << ", "
<< "enable_ir_optimization = " << _enable_ir_optimization << ", "
<< "static_optimization = " << _static_optimization << ", "
<< "force_update_static_cache = " << _force_update_static_cache;
......@@ -92,6 +104,8 @@ class InferEngineCreationParams {
bool _static_optimization;
bool _force_update_static_cache;
bool _use_trt;
bool _use_lite;
bool _use_xpu;
};
class InferEngine {
......@@ -105,9 +119,7 @@ class InferEngine {
virtual int thrd_initialize() { return thrd_initialize_impl(); }
virtual int thrd_clear() { return thrd_clear_impl(); }
virtual int thrd_finalize() { return thrd_finalize_impl(); }
virtual int infer(const void* in, void* out, uint32_t batch_size = -1) {
return infer_impl1(in, out, batch_size);
}
virtual int infer() { return infer_impl(); }
virtual int reload() = 0;
......@@ -120,11 +132,13 @@ class InferEngine {
virtual int thrd_finalize_impl() = 0;
virtual int thrd_clear_impl() = 0;
virtual int proc_finalize_impl() = 0;
virtual int infer_impl1(const void* in,
void* out,
uint32_t batch_size = -1) = 0;
virtual int infer_impl2(const BatchTensor& in,
BatchTensor& out) = 0; // NOLINT
virtual std::vector<std::string> GetInputNames() = 0;
virtual std::vector<std::string> GetOutputNames() = 0;
virtual std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) = 0;
virtual std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) = 0;
virtual int infer_impl() = 0;
// end: framework inner call
};
......@@ -138,8 +152,6 @@ class ReloadableInferEngine : public InferEngine {
uint64_t last_revision;
};
typedef im::bsf::Task<Tensor, Tensor> TaskT;
virtual int load(const InferEngineCreationParams& params) = 0;
int proc_initialize_impl(const configure::EngineDesc& conf, bool version) {
......@@ -182,6 +194,14 @@ class ReloadableInferEngine : public InferEngine {
_infer_engine_params.set_use_trt(conf.use_trt());
}
if (conf.has_use_lite()) {
_infer_engine_params.set_use_lite(conf.use_lite());
}
if (conf.has_use_xpu()) {
_infer_engine_params.set_use_xpu(conf.use_xpu());
}
if (!check_need_reload() || load(_infer_engine_params) != 0) {
LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
return -1;
......@@ -201,45 +221,10 @@ class ReloadableInferEngine : public InferEngine {
LOG(ERROR) << "Failed proc initialize impl";
return -1;
}
// init bsf framework
if (_infer_thread_num <= 0) {
return 0;
}
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_reset_fn(
boost::bind(&InferEngine::thrd_clear_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_callback_fn(
boost::bind(&InferEngine::infer_impl2, this, _1, _2));
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_size(_infer_batch_size);
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_align(
_infer_batch_align);
if (im::bsf::TaskExecutor<TaskT>::instance()->start(_infer_thread_num) !=
0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
return -1;
}
LOG(WARNING) << "Enable batch schedule framework, thread_num:"
<< _infer_thread_num << ", batch_size:" << _infer_batch_size
<< ", enable_batch_align:" << _infer_batch_align;
return 0;
}
int infer(const void* in, void* out, uint32_t batch_size = -1) {
if (_infer_thread_num <= 0) {
return infer_impl1(in, out, batch_size);
}
im::bsf::TaskManager<Tensor, Tensor> task_manager;
task_manager.schedule(*(reinterpret_cast<const BatchTensor*>(in)),
*(reinterpret_cast<BatchTensor*>(out)));
task_manager.wait();
return 0;
}
int infer() { return infer_impl(); }
int thrd_initialize() {
if (_infer_thread_num > 0) {
......@@ -263,10 +248,6 @@ class ReloadableInferEngine : public InferEngine {
return -1;
}
if (_infer_thread_num > 0) {
im::bsf::TaskExecutor<TaskT>::instance()->stop();
}
return 0;
}
......@@ -417,10 +398,6 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _infer_engine_params) != 0) {
......@@ -430,17 +407,12 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
}
THREAD_SETSPECIFIC(_skey, md);
im::bsf::AutoMutex lock(_mutex);
_reload_vec.push_back(md);
return 0;
}
int thrd_clear_impl() {
// for non-serving-threads
if (MempoolWrapper::instance().thread_clear() != 0) {
LOG(ERROR) << "Failed thread clear mempool";
return -1;
}
return 0;
}
......@@ -538,12 +510,6 @@ class CloneDBReloadableInferEngine
}
virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _pd->cores[_pd->current_idx]) != 0) {
LOG(ERROR) << "Failed clone thread data, origin_core["
......@@ -552,7 +518,6 @@ class CloneDBReloadableInferEngine
}
THREAD_SETSPECIFIC(DBReloadableInferEngine<EngineCore>::_skey, md);
im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
DBReloadableInferEngine<EngineCore>::_reload_vec.push_back(md);
return 0;
}
......@@ -571,8 +536,45 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
public: // NOLINT
FluidInferEngine() {}
~FluidInferEngine() {}
std::vector<std::string> GetInputNames() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetInputNames();
}
std::vector<std::string> GetOutputNames() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
}
return core->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
LOG(ERROR) << "Failed get fluid core in GetOutputHandle()";
}
return core->GetOutputHandle(name);
}
int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
int infer_impl() {
FluidFamilyCore* core =
DBReloadableInferEngine<FluidFamilyCore>::get_core();
if (!core || !core->get()) {
......@@ -580,16 +582,12 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
return -1;
}
if (!core->Run(in, out)) {
if (!core->Run()) {
LOG(ERROR) << "Failed run fluid family core";
return -1;
}
return 0;
}
int infer_impl2(const BatchTensor& in, BatchTensor& out) { // NOLINT
return infer_impl1(&in, &out);
}
};
typedef FactoryPool<InferEngine> StaticInferFactory;
......@@ -715,13 +713,45 @@ class VersionedInferEngine : public InferEngine {
return _versions.begin()->second;
}
int infer(const void* in, void* out, uint32_t batch_size) {
int infer() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
return -1;
}
return engine->infer(in, out, batch_size);
return engine->infer();
}
std::vector<std::string> GetInputNames() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetInputNames();
}
std::vector<std::string> GetOutputNames() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const std::string& name) {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const std::string& name) {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
}
return engine->GetOutputHandle(name);
}
template <typename T>
......@@ -740,14 +770,47 @@ class VersionedInferEngine : public InferEngine {
}
// versioned inference interface
int infer(const void* in, void* out, uint32_t batch_size, uint64_t version) {
int infer(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
return -1;
}
return iter->second->infer(in, out, batch_size);
return iter->second->infer();
}
std::vector<std::string> GetInputNames(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetInputNames();
}
std::vector<std::string> GetOutputNames(uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
uint64_t version, const std::string& name) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
uint64_t version, const std::string& name) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
}
return iter->second->GetOutputHandle(name);
}
template <typename T>
......@@ -774,12 +837,7 @@ class VersionedInferEngine : public InferEngine {
int thrd_finalize_impl() { return -1; }
int thrd_clear_impl() { return -1; }
int proc_finalize_impl() { return -1; }
int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
return -1;
}
int infer_impl2(const BatchTensor& in, BatchTensor& out) { // NOLINT
return -1;
} // NOLINT
int infer_impl() { return -1; }
private:
boost::unordered_map<uint64_t, InferEngine*> _versions;
......@@ -877,16 +935,44 @@ class InferManager {
}
// Inference interface
int infer(const char* model_name,
const void* in,
void* out,
uint32_t batch_size = -1) {
int infer(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return -1;
}
return it->second->infer(in, out, batch_size);
return it->second->infer();
}
std::vector<std::string> GetInputNames(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputNames();
}
std::vector<std::string> GetOutputNames(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputNames();
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const char* model_name, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputHandle(name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const char* model_name, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputHandle(name);
}
template <typename T>
......@@ -906,19 +992,48 @@ class InferManager {
}
// Versioned inference interface
int infer(const char* model_name,
const void* in,
void* out,
uint32_t batch_size,
uint64_t version) {
int infer(const char* model_name, uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return -1;
}
return it->second->infer(in, out, batch_size, version);
return it->second->infer(version);
}
std::vector<std::string> GetInputNames(const char* model_name,
uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputNames(version);
}
std::vector<std::string> GetOutputNames(const char* model_name,
uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputNames(version);
}
std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
const char* model_name, uint64_t version, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetInputHandle(version, name);
}
std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
const char* model_name, uint64_t version, const std::string& name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
}
return it->second->GetOutputHandle(version, name);
}
template <typename T>
T* get_core(const char* model_name, uint64_t version) {
auto it = _map.find(model_name);
......
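Putting the refactored interface together: `InferManager` no longer takes `(in, out, batch_size)` buffers; callers move data through input/output handles and trigger the engine with a plain `infer(model_name)`. A hedged end-to-end sketch of that calling convention (names and shapes are illustrative):

```cpp
// Sketch: one inference round trip with the refactored InferManager API.
// Assumes a model named `model_name` has been loaded and has a single
// float input and a single float output.
#include <string>
#include <vector>

#include "core/predictor/framework/infer.h"

using baidu::paddle_serving::predictor::InferManager;

int run_once(const char* model_name, const std::vector<float>& feed,
             std::vector<float>* result) {
  auto& im = InferManager::instance();

  // Feed: write the request data into the first input tensor.
  std::vector<std::string> in_names = im.GetInputNames(model_name);
  auto in = im.GetInputHandle(model_name, in_names[0]);
  in->Reshape({1, static_cast<int>(feed.size())});
  in->CopyFromCpu(feed.data());

  // Run the engine; tensors are no longer passed through this call.
  if (im.infer(model_name) != 0) {
    return -1;
  }

  // Fetch: read the first output tensor back to host memory.
  std::vector<std::string> out_names = im.GetOutputNames(model_name);
  auto out = im.GetOutputHandle(model_name, out_names[0]);
  int capacity = 1;
  for (int dim : out->shape()) {
    capacity *= dim;
  }
  result->resize(capacity);
  out->CopyToCpu(result->data());
  return 0;
}
```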
......@@ -122,6 +122,7 @@ make -j10
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
export TENSORRT_LIBRARY_PATH="/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/"
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
......
......@@ -19,7 +19,9 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-p
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl
#cuda10.2 with TensorRT 7
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py3-none-any.whl
```
### Python 2
```
......@@ -28,7 +30,9 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-p
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
##cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py2-none-any.whl
#cuda10.2 with TensorRT 7
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py2-none-any.whl
```
## Client
......
......@@ -676,7 +676,7 @@ service_throughput = 1 / (latency of the slowest OP) * concurrency
service_avg_cost = ∑ op_concurrency [critical path]
Channel backlog:
channel_acc_size = QPS(down - up) * time
channel_acc_size = QPS(down - up) * time
Average latency of batch prediction:
avg_batch_cost = (N * pre + mid + post) / N
......
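For a quick illustration with made-up numbers: with N = 8, pre = 1 ms, mid = 10 ms and post = 1 ms, avg_batch_cost = (8 * 1 + 10 + 1) / 8 ≈ 2.4 ms per sample, versus 12 ms when the same request is served without batching.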
......@@ -49,4 +49,4 @@ Arguments are the same as `inference_model_to_serving` API.
| `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. |
| `serving_client` | str | `"serving_client"` | The path of configuration files for client. |
| `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. |
| `paras_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
| `params_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
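For example (hypothetical paths; assuming the command-line flags mirror the parameter names above), `python -m paddle_serving_client.convert --dirname ./your_inference_model_dir --model_filename model --params_filename params` converts a model whose parameters are stored in a single `params` file.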
......@@ -50,4 +50,4 @@ python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
| `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |
| `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client |
| `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 |
| `paras_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |
| `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |
......@@ -27,7 +27,7 @@ mvn compile
mvn install
```
### Start the server
### Start the server (non-Pipeline)
Take the fit_a_line model as an example; the server starts as follows
......@@ -59,6 +59,48 @@ Client prediction
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# The case of yolov4 needs to specify a picture as input
```
### Start the server(pipeline)
For input data of type string, take the IMDB model ensemble as an example; the server starts as follows
```
cd ../../python/examples/pipeline/imdb_model_ensemble
sh get_data.sh
python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
Client prediction (synchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
```
Client prediction (asynchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
```
For input data of type INDArray, take uci_housing_model as an example; the server starts as follows
```
cd ../../python/examples/pipeline/simple_web_service
sh get_data.sh
python web_service_java.py &>log.txt &
```
Client prediction (synchronous)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
```
### Customization guidance
......@@ -70,6 +112,8 @@ The second is to deploy GPU Serving and Java Client separately. If they are on t
**It should be noted that in the examples, all models need to be started with `--use_multilang` to enable gRPC multi-language support, and the port number is 9393. If you need a different port, modify it in the Java file.**
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). The next version (0.4.1) of the Pipeline Serving Client for Java will be released. **
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). The Pipeline Serving Client for Java has been released; a multi-threaded Java client example will follow in the next version.**
**It should be noted that the Java Pipeline Client code is under /Java/Examples and /Java/src/main, and the corresponding Pipeline server code is under /python/examples/pipeline/.**
......@@ -27,7 +27,7 @@ mvn compile
mvn install
```
### 启动服务端
### 启动服务端(非pipeline方式)
以fit_a_line模型为例,服务端启动
......@@ -58,6 +58,49 @@ python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu
# in /Serving/java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# yolov4的案例需要指定一个图片作为输入
```
### 启动服务端(Pipeline方式)
对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动
```
cd ../../python/examples/pipeline/imdb_model_ensemble
sh get_data.sh
python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
客户端预测(同步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
```
客户端预测(异步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
```
对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动
```
cd ../../python/examples/pipeline/simple_web_service
sh get_data.sh
python web_service_java.py &>log.txt &
```
客户端预测(同步)
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
```
### 二次开发指导
......@@ -70,6 +113,9 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Paddle
**需要注意的是,在示例中,所有模型都需要使用`--use_multilang`来启动GRPC多编程语言支持,以及端口号都是9393,如果需要别的端口,需要在java文件里修改**
**目前Serving已推出Pipeline模式(详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),下个版本(0.4.1)面向Java的Pipeline Serving Client将会发布,敬请期待。**
**目前Serving已推出Pipeline模式(详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布,下个更新会发布Java版本的多线程用例,敬请期待。**
**需要注意的是,Java Pipeline Client相关示例在/Java/Examples和/Java/src/main中,对应的Pipeline server在/python/examples/pipeline/中**
import io.paddle.serving.pipelineclient.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.datavec.image.loader.NativeImageLoader;
import org.nd4j.linalg.api.ops.CustomOp;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.*;
/**
* This class gives an example of using the pipeline client for prediction (gRPC).
* StaticPipelineClient.client supports multi-threaded use.
* By setting StaticPipelineClient.client properties, you can change the maximum concurrency.
* There is no need to create multiple client instances; use StaticPipelineClient.client or a singleton instead.
* @author HexToString
*/
public class PipelineClientExample {
/**
* This method gives an example of synchronous prediction whose input type is string.
*/
boolean string_imdb_predict() {
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("words", "i am very sad | 0");
}};
System.out.println(feed_data);
List<String> fetch = Arrays.asList("prediction");
System.out.println(fetch);
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
System.out.println("connect failed.");
return false;
}
}
HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method gives an example of asynchronous prediction whose input type is string.
*/
boolean asyn_predict() {
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("words", "i am very sad | 0");
}};
System.out.println(feed_data);
List<String> fetch = Arrays.asList("prediction");
System.out.println(fetch);
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
System.out.println("connect failed.");
return false;
}
}
PipelineFuture future = StaticPipelineClient.client.asyn_predict(feed_data, fetch, false, 0);
HashMap<String,String> result = future.get();
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method gives an example of synchronous prediction whose input type is an array, list, or matrix.
* Use the Nd4j.createFromArray method to convert an array to an INDArray.
* Use the convertINDArrayToString method to convert an INDArray to the String format expected by the Python numpy eval method.
*/
boolean indarray_predict() {
float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f, -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
HashMap<String, String> feed_data
= new HashMap<String, String>() {{
put("x", convertINDArrayToString(npdata));
}};
List<String> fetch = Arrays.asList("prediction");
if (StaticPipelineClient.succ != true) {
if(!StaticPipelineClient.initClient("172.17.0.2","9998")){
System.out.println("connect failed.");
return false;
}
}
HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
if (result == null) {
return false;
}
System.out.println(result);
return true;
}
/**
* This method converts an INDArray to the specified String format.
* @param npdata INDArray type(The input data).
* @return String (specified String type for python Numpy eval method).
*/
String convertINDArrayToString(INDArray npdata){
return "array("+npdata.toString()+")";
}
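// Example (illustrative; exact number formatting depends on Nd4j's toString()):
//   convertINDArrayToString(Nd4j.createFromArray(new float[]{1f, 2f}))
//   returns something like "array([1.0000, 2.0000])", which the numpy-based
//   Python server side can evaluate.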
/**
* This method is the entry function.
* @param args String[] type(Command line parameters)
*/
public static void main( String[] args ) {
PipelineClientExample e = new PipelineClientExample();
boolean succ = false;
if (args.length < 1) {
System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type>.");
System.out.println("<test-type>: fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4");
return;
}
String testType = args[0];
System.out.format("[Example] %s\n", testType);
if ("string_imdb_predict".equals(testType)) {
succ = e.string_imdb_predict();
}else if ("asyn_predict".equals(testType)) {
succ = e.asyn_predict();
}else if ("indarray_predict".equals(testType)) {
succ = e.indarray_predict();
} else {
System.out.format("test-type(%s) not match.\n", testType);
return;
}
if (succ == true) {
System.out.println("[Example] succ.");
} else {
System.out.println("[Example] fail.");
}
}
}
import io.paddle.serving.pipelineclient.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.datavec.image.loader.NativeImageLoader;
import org.nd4j.linalg.api.ops.CustomOp;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.*;
/**
* static resource management class
* @author HexToString
*/
public class StaticPipelineClient {
/**
* Static Variable PipelineClient
*/
public static PipelineClient client = new PipelineClient();
/**
* Connection status flag.
*/
public static boolean succ = false;
/**
* This method connects to the server and returns the connection status.
* @param strIp String type(The server ipv4) such as "192.168.10.10".
* @param strPort String type(The server port) such as "8891".
* @return boolean (the connection status).
*/
public static boolean initClient(String strIp,String strPort){
String target = strIp+ ":"+ strPort;//"172.17.0.2:18070";
System.out.println("initial connect.");
if(succ){
System.out.println("already connect.");
return true;
}
succ = client.connect(target);
if (succ != true) {
System.out.println("connect failed.");
return false;
}
return true;
}
}
package io.paddle.serving.pipelineclient;
import java.util.*;
import java.util.function.Function;
import java.lang.management.ManagementFactory;
import java.lang.management.RuntimeMXBean;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import io.grpc.StatusRuntimeException;
import com.google.protobuf.ByteString;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.factory.Nd4j;
import io.paddle.serving.pipelineproto.*;
import io.paddle.serving.pipelineclient.PipelineFuture;
/**
* PipelineClient class definition
* @author HexToString
*/
public class PipelineClient {
private ManagedChannel channel_;
private PipelineServiceGrpc.PipelineServiceBlockingStub blockingStub_;
private PipelineServiceGrpc.PipelineServiceFutureStub futureStub_;
private String clientip;
private String _profile_key;
private String _profile_value;
public PipelineClient() {
channel_ = null;
blockingStub_ = null;
futureStub_ = null;
boolean is_profile = false;
clientip = null;
_profile_value = "1";
_profile_key = "pipeline.profile";
}
/**
* This method connects to the target and returns the connection status.
* @param target String type(The server ipv4 and port) such as "192.168.10.10:8891".
* @return boolean (the connection status).
*/
public boolean connect(String target) {
try {
String[] temp = target.split(":");
this.clientip = temp[0] == "localhost"?"127.0.0.1":temp[0];
channel_ = ManagedChannelBuilder.forTarget(target)
.defaultLoadBalancingPolicy("round_robin")
.maxInboundMessageSize(Integer.MAX_VALUE)
.usePlaintext()
.build();
blockingStub_ = PipelineServiceGrpc.newBlockingStub(channel_);
futureStub_ = PipelineServiceGrpc.newFutureStub(channel_);
} catch (Exception e) {
System.out.format("Connect failed: %s\n", e.toString());
return false;
}
return true;
}
/**
* This method returns the Packaged Request.
* @param feed_dict HashMap<String, String>(input data).
* @param profile boolean(profile sign).
* @param logid int
* @return Request (the grpc protobuf Request).
*/
private Request _packInferenceRequest(
HashMap<String, String> feed_dict,
boolean profile,
int logid) throws IllegalArgumentException {
List<String> keys = new ArrayList<String>();
List<String> values = new ArrayList<String>();
long[] flattened_shape = {-1};
Request.Builder req_builder = Request.newBuilder()
.setClientip(this.clientip)
.setLogid(logid);
for (Map.Entry<String, String> entry : feed_dict.entrySet()) {
keys.add(entry.getKey());
values.add(entry.getValue());
}
if(profile){
keys.add(_profile_key);
values.add(_profile_value);
}
req_builder.addAllKey(keys);
req_builder.addAllValue(values);
return req_builder.build();
}
/**
* This method returns the HashMap which is unpackaged from Response.
* @param resp Response(the grpc protobuf Response).
* @return HashMap<String,String> (the output).
*/
private HashMap<String,String> _unpackResponse(Response resp) throws IllegalArgumentException{
return PipelineClient._staitcUnpackResponse(resp);
}
/**
* This static method returns the HashMap which is unpackaged from Response.
* @param resp Response(the grpc protobuf Response).
* @return HashMap<String,String> (the output).
*/
private static HashMap<String,String> _staitcUnpackResponse(Response resp) {
HashMap<String,String> ret_Map = new HashMap<String,String>();
int err_no = resp.getErrNo();
if (err_no != 0) {
return null;
}
List<String> keys = resp.getKeyList();
List<String> values= resp.getValueList();
for (int i = 0;i<keys.size();i++) {
ret_Map.put(keys.get(i),values.get(i));
}
return ret_Map;
}
/**
* The synchronous prediction method.
* @param feed_batch HashMap<String, String>(input data).
* @param fetch Iterable<String>(the output key list).
* @param profile boolean(profile sign).
* @param logid int
* @return HashMap<String,String> (the output).
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile,
int logid) {
try {
Request req = _packInferenceRequest(
feed_batch, profile,logid);
Response resp = blockingStub_.inference(req);
return _unpackResponse(resp);
} catch (StatusRuntimeException e) {
System.out.format("Failed to predict: %s\n", e.toString());
return null;
}
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch) {
return predict(feed_batch,fetch,false,0);
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile) {
return predict(feed_batch,fetch,profile,0);
}
/**
* The synchronous prediction overload function.
*/
public HashMap<String,String> predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
int logid) {
return predict(feed_batch,fetch,false,logid);
}
/**
* The asynchronous prediction method. Use future.get() to obtain the result.
* @param feed_batch HashMap<String, String>(input data).
* @param fetch Iterable<String>(the output key list).
* @param profile boolean(profile sign).
* @param logid int
* @return PipelineFuture(the output future).
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile,
int logid) {
Request req = _packInferenceRequest(
feed_batch, profile, logid);
ListenableFuture<Response> future = futureStub_.inference(req);
PipelineFuture predict_future = new PipelineFuture(future,
(Response resp) -> {
return PipelineClient._staitcUnpackResponse(resp);
}
);
return predict_future;
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch) {
return asyn_predict(feed_batch,fetch,false,0);
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
boolean profile) {
return asyn_predict(feed_batch,fetch,profile,0);
}
/**
* The asynchronous prediction overload function.
*/
public PipelineFuture asyn_predict(
HashMap<String, String> feed_batch,
Iterable<String> fetch,
int logid) {
return asyn_predict(feed_batch,fetch,false,logid);
}
}
package io.paddle.serving.pipelineclient;
import java.util.*;
import java.util.function.Function;
import io.grpc.StatusRuntimeException;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import io.paddle.serving.pipelineclient.PipelineClient;
import io.paddle.serving.pipelineproto.*;
/**
* PipelineFuture class is for asynchronous prediction
* @author HexToString
*/
public class PipelineFuture {
private ListenableFuture<Response> callFuture_;
private Function<Response,
HashMap<String,String> > callBackFunc_;
PipelineFuture(ListenableFuture<Response> call_future,
Function<Response,
HashMap<String,String> > call_back_func) {
callFuture_ = call_future;
callBackFunc_ = call_back_func;
}
/**
* use this method to get the result of asynchronous prediction.
*/
public HashMap<String,String> get() {
Response resp = null;
try {
resp = callFuture_.get();
} catch (Exception e) {
System.out.format("predict failed: %s\n", e.toString());
return null;
}
HashMap<String,String> result
= callBackFunc_.apply(resp);
return result;
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
option java_multiple_files = true;
option java_package = "io.paddle.serving.pipelineproto";
option java_outer_classname = "PipelineProto";
package baidu.paddle_serving.pipeline_serving;
message Request {
repeated string key = 1;
repeated string value = 2;
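// key[i] pairs with value[i]: the pipeline client packs each feed_dict entry
// (and an optional profiling flag) into these two parallel lists.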
optional string name = 3;
optional string method = 4;
optional int64 logid = 5;
optional string clientip = 6;
};
message Response {
optional int32 err_no = 1;
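// 0 means success; the Java pipeline client returns null for any non-zero err_no.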
optional string err_msg = 2;
repeated string key = 3;
repeated string value = 4;
};
service PipelineService {
rpc inference(Request) returns (Response) {}
};
......@@ -13,8 +13,13 @@
# limitations under the License
if (NOT CLIENT_ONLY)
add_subdirectory(inferencer-fluid-cpu)
if (WITH_GPU)
add_subdirectory(inferencer-fluid-gpu)
endif()
add_subdirectory(inferencer-fluid-cpu)
if (WITH_GPU)
add_subdirectory(inferencer-fluid-gpu)
endif()
if (WITH_LITE)
add_subdirectory(inferencer-fluid-arm)
endif()
endif()
FILE(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
target_include_directories(fluid_arm_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
target_link_libraries(fluid_arm_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_arm_engine
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pthread.h>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
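// Note: predictor creation below is serialized through this global mutex; see
// the AutoLock usage in each create() implementation.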
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::PrecisionType;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int create(const predictor::InferEngineCreationParams& params) = 0;
virtual int clone(void* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
return 0;
}
virtual void* get() { return _core.get(); }
protected:
std::shared_ptr<Predictor> _core;
};
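// Usage sketch (added comment, not in the original header): callers now move
// data through the handle API instead of the removed Run(in, out) overload.
// The input name "x" and the 1x3 shape below are illustrative assumptions.
//
//   std::unique_ptr<Tensor> in = core->GetInputHandle("x");
//   std::vector<float> buf = {0.f, 1.f, 2.f};
//   in->Reshape({1, 3});
//   in->CopyFromCpu(buf.data());
//   core->Run();
//   std::unique_ptr<Tensor> out = core->GetOutputHandle(core->GetOutputNames()[0]);
//   // out->shape() gives the element count; CopyToCpu fills a caller buffer.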
// infer interface
class FluidArmAnalysisCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidArmAnalysisDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
}
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() {
_row = 0;
_col = 0;
if (_params != NULL) {
free(_params);
_params = NULL;
}
}
int load() {
if (_params == NULL || _row <= 0 || _col <= 0) {
LOG(ERROR) << "load parameter error [not inited].";
return -1;
}
FILE* fs = fopen(_file_name.c_str(), "rb");
if (fs == NULL) {
LOG(ERROR) << "load " << _file_name << " fopen error.";
return -1;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
uint32_t matrix_size = _row * _col;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
VLOG(2) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0;
}
public:
std::string _file_name;
int _row;
int _col;
float* _params;
};
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h"
#include "core/predictor/framework/factory.h"
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidArmAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS_DIR");
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
......@@ -28,8 +28,6 @@ namespace baidu {
namespace paddle_serving {
namespace fluid_cpu {
using configure::SigmoidConf;
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
......@@ -57,31 +55,36 @@ class GlobalPaddleCreateMutex {
pthread_mutex_t _mut;
};
class GlobalSigmoidCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalSigmoidCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
......@@ -92,8 +95,7 @@ class FluidFamilyCore {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
paddle::PaddlePredictor* p_predictor =
(paddle::PaddlePredictor*)origin_core;
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
......@@ -105,7 +107,7 @@ class FluidFamilyCore {
virtual void* get() { return _core.get(); }
protected:
std::unique_ptr<paddle::PaddlePredictor> _core;
std::shared_ptr<Predictor> _core;
};
// infer interface
......@@ -119,51 +121,19 @@ class FluidCpuAnalysisCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetParamsFile(data_path + "/__params__");
analysis_config.SetProgFile(data_path + "/__model__");
analysis_config.DisableGpu();
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuNativeCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.param_file = data_path + "/__params__";
native_config.prog_file = data_path + "/__model__";
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -184,54 +154,24 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.DisableGpu();
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
config.SwitchIrOptim(false);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuNativeDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -323,214 +263,6 @@ class Parameter {
float* _params;
};
class SigmoidModel {
public:
~SigmoidModel() {}
int load(const char* sigmoid_w_file,
const char* sigmoid_b_file,
float exp_max,
float exp_min) {
AutoLock lock(GlobalSigmoidCreateMutex::instance());
if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
LOG(ERROR) << "load params sigmoid_w failed.";
return -1;
}
VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
<< _sigmoid_w._params[1] << "].";
if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
LOG(ERROR) << "load params sigmoid_b failed.";
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
}
int softmax(float x, double& o) { // NOLINT
float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
_y0 = (_y0 > _exp_max_input)
? _exp_max_input
: ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
_y1 = (_y1 > _exp_max_input)
? _exp_max_input
: ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
o = 1.0f / (1.0f + exp(_y0 - _y1));
return 0;
}
public:
Parameter _sigmoid_w;
Parameter _sigmoid_b;
float _exp_max_input;
float _exp_min_input;
};
class SigmoidFluidModel {
public:
int softmax(float x, double& o) { // NOLINT
return _sigmoid_core->softmax(x, o);
} // NOLINT
std::unique_ptr<SigmoidFluidModel> Clone() {
std::unique_ptr<SigmoidFluidModel> clone_model;
clone_model.reset(new SigmoidFluidModel());
clone_model->_sigmoid_core = _sigmoid_core;
clone_model->_fluid_core = _fluid_core->Clone();
return std::move(clone_model); // NOLINT
}
public:
std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
std::shared_ptr<SigmoidModel> _sigmoid_core;
};
class FluidCpuWithSigmoidCore : public FluidFamilyCore {
public:
virtual ~FluidCpuWithSigmoidCore() {}
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string model_path = params.get_path();
size_t pos = model_path.find_last_of("/\\");
std::string conf_path = model_path.substr(0, pos);
std::string conf_file = model_path.substr(pos);
configure::SigmoidConf conf;
if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
LOG(ERROR) << "failed load model path: " << model_path;
return -1;
}
_core.reset(new SigmoidFluidModel);
std::string fluid_model_data_path = conf.dnn_model_path();
predictor::InferEngineCreationParams new_params(params);
new_params.set_path(fluid_model_data_path);
int ret = load_fluid_model(new_params);
if (ret < 0) {
LOG(ERROR) << "fail to load fluid model.";
return -1;
}
const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
float exp_max = conf.exp_max_input();
float exp_min = conf.exp_min_input();
_core->_sigmoid_core.reset(new SigmoidModel);
VLOG(2) << "create sigmoid core[" << _core->_sigmoid_core.get()
<< "], use count[" << _core->_sigmoid_core.use_count() << "].";
ret = _core->_sigmoid_core->load(
sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
if (ret < 0) {
LOG(ERROR) << "fail to load sigmoid model.";
return -1;
}
return 0;
}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->_fluid_core->Run(
*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int clone(SigmoidFluidModel* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
_core = origin_core->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
VLOG(2) << "clone sigmoid core[" << _core->_sigmoid_core.get()
<< "] use count[" << _core->_sigmoid_core.use_count() << "].";
return 0;
}
virtual SigmoidFluidModel* get() { return _core.get(); }
virtual int load_fluid_model(
const predictor::InferEngineCreationParams& params) = 0;
int softmax(float x, double& o) { // NOLINT
return _core->_sigmoid_core->softmax(x, o);
}
protected:
std::unique_ptr<SigmoidFluidModel> _core; // NOLINT
};
class FluidCpuNativeDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = false;
native_config.device = 0;
native_config.fraction_of_gpu_memory = 0;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuAnalysisDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.DisableGpu();
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
} // namespace fluid_cpu
} // namespace paddle_serving
} // namespace baidu
......@@ -30,28 +30,6 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuAnalysisDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_DIR_SIGMOID");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuNativeDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_NATIVE_DIR_SIGMOID");
} // namespace fluid_cpu
} // namespace paddle_serving
} // namespace baidu
......@@ -61,31 +61,36 @@ class GlobalPaddleCreateMutex {
pthread_mutex_t _mut;
};
class GlobalSigmoidCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalSigmoidCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
......@@ -96,8 +101,7 @@ class FluidFamilyCore {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
paddle::PaddlePredictor* p_predictor =
(paddle::PaddlePredictor*)origin_core;
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
......@@ -109,7 +113,7 @@ class FluidFamilyCore {
virtual void* get() { return _core.get(); }
protected:
std::unique_ptr<paddle::PaddlePredictor> _core;
std::shared_ptr<Predictor> _core;
};
// infer interface
......@@ -123,51 +127,19 @@ class FluidGpuAnalysisCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetParamsFile(data_path + "/__params__");
analysis_config.SetProgFile(data_path + "/__model__");
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.EnableUseGpu(100, FLAGS_gpuid);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuNativeCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.param_file = data_path + "/__params__";
native_config.prog_file = data_path + "/__model__";
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -188,110 +160,38 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
Config config;
config.SetModel(data_path);
config.EnableUseGpu(1500, FLAGS_gpuid);
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
#if 0 // todo: support flexible shape
int min_seq_len = 1;
int max_seq_len = 512;
int opt_seq_len = 128;
int head_number = 12;
int batch = 50;
std::vector<int> min_in_shape = {batch, min_seq_len, 1};
std::vector<int> max_in_shape = {batch, max_seq_len, 1};
std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
std::string input1_name = "src_text_a_ids";
std::string input2_name = "pos_text_a_ids";
std::string input3_name = "sent_text_a_ids";
std::string input4_name = "stack_0.tmp_0";
std::map<std::string, std::vector<int>> min_input_shape = {
{input1_name, min_in_shape},
{input2_name, min_in_shape},
{input3_name, min_in_shape},
{input4_name, {batch, head_number, min_seq_len, min_seq_len}},
};
std::map<std::string, std::vector<int>> max_input_shape = {
{input1_name, max_in_shape},
{input2_name, max_in_shape},
{input3_name, max_in_shape},
{input4_name, {batch, head_number, max_seq_len, max_seq_len}},
};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input1_name, opt_in_shape},
{input2_name, opt_in_shape},
{input3_name, opt_in_shape},
{input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
};
analysis_config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
#endif
int max_batch = 32;
int min_subgraph_size = 3;
if (params.use_trt()) {
analysis_config.EnableTensorRtEngine(
1 << 20,
max_batch,
min_subgraph_size,
paddle::AnalysisConfig::Precision::kFloat32,
false,
false);
config.EnableTensorRtEngine(1 << 20,
max_batch,
min_subgraph_size,
Config::Precision::kFloat32,
false,
false);
LOG(INFO) << "create TensorRT predictor";
} else {
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
config.SwitchIrOptim(false);
}
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuNativeDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
......@@ -383,214 +283,6 @@ class Parameter {
float* _params;
};
class SigmoidModel {
public:
~SigmoidModel() {}
int load(const char* sigmoid_w_file,
const char* sigmoid_b_file,
float exp_max,
float exp_min) {
AutoLock lock(GlobalSigmoidCreateMutex::instance());
if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
LOG(ERROR) << "load params sigmoid_w failed.";
return -1;
}
VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
<< _sigmoid_w._params[1] << "].";
if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
LOG(ERROR) << "load params sigmoid_b failed.";
return -1;
}
VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
<< _sigmoid_b._params[1] << "].";
_exp_max_input = exp_max;
_exp_min_input = exp_min;
return 0;
}
int softmax(float x, double& o) { // NOLINT
float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
_y0 = (_y0 > _exp_max_input)
? _exp_max_input
: ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
_y1 = (_y1 > _exp_max_input)
? _exp_max_input
: ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
o = 1.0f / (1.0f + exp(_y0 - _y1));
return 0;
}
public:
Parameter _sigmoid_w;
Parameter _sigmoid_b;
float _exp_max_input;
float _exp_min_input;
};
class SigmoidFluidModel {
public:
int softmax(float x, double& o) { // NOLINT
return _sigmoid_core->softmax(x, o);
} // NOLINT
std::unique_ptr<SigmoidFluidModel> Clone() {
std::unique_ptr<SigmoidFluidModel> clone_model;
clone_model.reset(new SigmoidFluidModel());
clone_model->_sigmoid_core = _sigmoid_core;
clone_model->_fluid_core = _fluid_core->Clone();
return std::move(clone_model);
}
public:
std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
std::shared_ptr<SigmoidModel> _sigmoid_core;
};
class FluidGpuWithSigmoidCore : public FluidFamilyCore {
public:
virtual ~FluidGpuWithSigmoidCore() {}
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string model_path = params.get_path();
size_t pos = model_path.find_last_of("/\\");
std::string conf_path = model_path.substr(0, pos);
std::string conf_file = model_path.substr(pos);
configure::SigmoidConf conf;
if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
LOG(ERROR) << "failed load model path: " << model_path;
return -1;
}
_core.reset(new SigmoidFluidModel);
std::string fluid_model_data_path = conf.dnn_model_path();
predictor::InferEngineCreationParams new_params(params);
new_params.set_path(fluid_model_data_path);
int ret = load_fluid_model(new_params);
if (ret < 0) {
LOG(ERROR) << "fail to load fluid model.";
return -1;
}
const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
float exp_max = conf.exp_max_input();
float exp_min = conf.exp_min_input();
_core->_sigmoid_core.reset(new SigmoidModel);
LOG(INFO) << "create sigmoid core[" << _core->_sigmoid_core.get()
<< "], use count[" << _core->_sigmoid_core.use_count() << "].";
ret = _core->_sigmoid_core->load(
sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
if (ret < 0) {
LOG(ERROR) << "fail to load sigmoid model.";
return -1;
}
return 0;
}
virtual bool Run(const void* in_data, void* out_data) {
if (!_core->_fluid_core->Run(
*(std::vector<paddle::PaddleTensor>*)in_data,
(std::vector<paddle::PaddleTensor>*)out_data)) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int clone(SigmoidFluidModel* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
_core = origin_core->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
LOG(INFO) << "clone sigmoid core[" << _core->_sigmoid_core.get()
<< "] use count[" << _core->_sigmoid_core.use_count() << "].";
return 0;
}
virtual SigmoidFluidModel* get() { return _core.get(); }
virtual int load_fluid_model(
const predictor::InferEngineCreationParams& params) = 0;
int softmax(float x, double& o) { // NOLINT
return _core->_sigmoid_core->softmax(x, o);
}
protected:
std::unique_ptr<SigmoidFluidModel> _core;
};
class FluidGpuNativeDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::NativeConfig native_config;
native_config.model_dir = data_path;
native_config.use_gpu = true;
native_config.fraction_of_gpu_memory = 0.01;
native_config.device = FLAGS_gpuid;
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::NativeConfig,
paddle::PaddleEngineKind::kNative>(
native_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuAnalysisDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
public:
int load_fluid_model(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core->_fluid_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
......@@ -32,28 +32,6 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuAnalysisDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_DIR_SIGMOID");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuNativeDirWithSigmoidCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_NATIVE_DIR_SIGMOID");
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
......@@ -7,7 +7,7 @@ if (CLIENT)
endif()
if (SERVER)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
file(INSTALL pipeline DESTINATION paddle_serving_server)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
......@@ -34,7 +34,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
endif()
if (SERVER)
if (NOT WITH_GPU)
if (NOT WITH_GPU AND NOT WITH_LITE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
......@@ -72,7 +72,7 @@ add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINA
endif()
if (SERVER)
if(NOT WITH_GPU)
if(NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
......@@ -81,12 +81,30 @@ if (SERVER)
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_TRT)
if(CUDA_VERSION EQUAL 10.1)
set(SUFFIX 101)
elseif(CUDA_VERSION EQUAL 10.2)
set(SUFFIX 102)
elseif(CUDA_VERSION EQUAL 11.0)
set(SUFFIX 110)
endif()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" ${SUFFIX}
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" trt
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
......
# Encryption Model Prediction
([简体中文](README_CN.md)|English)
## Get Origin Model
The example uses the model files of the fit_a_line example as the origin model
```
sh get_data.sh
```
## Encrypt Model
```
python encrypt.py
```
The key is stored in the `key` file. The encrypted model files and the server-side configuration files are stored in the `encrypt_server` directory, and the client-side configuration files are stored in the `encrypt_client` directory.
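For reference, the encryption step boils down to a single call to `inference_model_to_serving` with `encryption=True`; a minimal sketch (paths match this example) is:
```python
# Minimal sketch of what encrypt.py does in this example.
from paddle_serving_client.io import inference_model_to_serving

inference_model_to_serving(
    dirname="./uci_housing_model",    # plaintext model fetched by get_data.sh
    serving_server="encrypt_server",  # encrypted model + server-side config
    serving_client="encrypt_client",  # client-side config
    encryption=True)                  # the key ends up in the `key` file
```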
## Start Encryption Service
CPU Service
```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
```
GPU Service
```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
```
## Prediction
```
python test_client.py uci_housing_client/serving_client_conf.prototxt
```
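`test_client.py` loads the client config passed on the command line, registers the key with `use_key("./key")`, and connects with `encryption=True`; a hedged sketch of that flow:
```python
# Hedged sketch of the encrypted-prediction client (see test_client.py for the full version).
import sys
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config(sys.argv[1])  # client-side prototxt passed on the command line
client.use_key("./key")                 # key produced by encrypt.py
client.connect(["127.0.0.1:9300"], encryption=True)

x = np.array([0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
              -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], dtype="float32")
print(client.predict(feed={"x": x}, fetch=["price"]))
```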
# Encryption Model Prediction
(Simplified Chinese|[English](README.md))
## Get Origin Model
The example uses the model files of the fit_a_line example as the origin (plaintext) model
```
sh get_data.sh
```
## Encrypt Model
```
python encrypt.py
```
The key is stored in the `key` file. The encrypted model files and the server-side configuration files are stored in the `encrypt_server` directory, and the client-side configuration files are stored in the `encrypt_client` directory.
## Start Encryption Service
CPU Service
```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
```
GPU Service
```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
```
## Prediction
```
python test_client.py uci_housing_client/serving_client_conf.prototxt
```
......@@ -11,21 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
from paddle_serving_client.io import inference_model_to_serving
client = Client()
client.connect(["127.0.0.1:9393"])
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": np.array(x)}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
def serving_encryption():
inference_model_to_serving(
dirname="./uci_housing_model",
serving_server="encrypt_server",
serving_client="encrypt_client",
encryption=True)
if __name__ == "__main__":
serving_encryption()
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing_example/encrypt.tar.gz
tar -xzf encrypt.tar.gz
cp -rvf ../fit_a_line/uci_housing_model .
cp -rvf ../fit_a_line/uci_housing_client .
......@@ -13,18 +13,20 @@
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
from paddle_serving_client import Client
import sys
client = Client()
client.connect(["127.0.0.1:9393"])
client.load_client_config(sys.argv[1])
client.use_key("./key")
client.connect(["127.0.0.1:9300"], encryption=True)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
import paddle
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=1)
for data in test_reader():
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
......@@ -16,7 +16,7 @@
import sys
import paddle
import paddle.fluid as fluid
paddle.enable_static()
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
......
......@@ -38,20 +38,9 @@ python test_asyn_client.py
python test_batch_client.py
```
### Generic protobuf prediction
``` shell
python test_general_pb_client.py
```
### Prediction timeout
``` shell
python test_timeout_client.py
```
### List input
``` shell
python test_list_input_client.py
```
......@@ -18,7 +18,7 @@ import functools
import time
import threading
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
......@@ -43,7 +43,8 @@ x = [
]
task_count = 0
for i in range(3):
future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
new_data = np.array(x).astype("float32").reshape((1,13))
future = client.predict(feed={"x": new_data}, fetch=["price"], batch=False, asyn=True)
task_count += 1
future.add_done_callback(functools.partial(call_back))
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
......@@ -24,8 +24,11 @@ x = [
]
for i in range(3):
batch_feed = [{"x": x} for j in range(batch_size)]
fetch_map = client.predict(feed=batch_feed, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1, 1, 13))
batch_data = np.concatenate([new_data, new_data, new_data], axis=0)
print(batch_data.shape)
fetch_map = client.predict(feed={"x":batch_data}, fetch=["price"], batch=True)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
......
......@@ -14,16 +14,27 @@
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
"""
for data in test_reader():
new_data = np.zeros((1, 1, 13)).astype("float32")
new_data[0] = data[0][0]
fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=True)
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
print(fetch_map)
"""
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1,13))
fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
......
......@@ -15,17 +15,18 @@
from paddle_serving_client import MultiLangClient as Client
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
client.set_rpc_timeout_ms(1)
client.set_rpc_timeout_ms(40)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
new_data = np.array(x).astype("float32").reshape((1,13))
fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:
......
......@@ -27,7 +27,7 @@ preprocess = Sequential([
postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
client = Client()
client.connect(['127.0.0.1:9393'])
# client.set_rpc_timeout_ms(10000)
client.set_rpc_timeout_ms(15000)
im = preprocess(sys.argv[1])
fetch_map = client.predict(
......@@ -35,7 +35,8 @@ fetch_map = client.predict(
"image": im,
"im_size": np.array(list(im.shape[1:])),
},
fetch=["save_infer_model/scale_0.tmp_0"])
fetch=["save_infer_model/scale_0.tmp_0"], batch=False)
print(fetch_map)
fetch_map.pop("serving_status_code")
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
......@@ -20,6 +20,9 @@ op:
#model path
model_config: ResNet50_vd_model
#compute device type: when left unset, it is determined by devices (CPU/GPU); 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
#compute device IDs: when devices is "" or omitted, inference runs on CPU; when it is "0" or "0,1,2", it runs on the listed GPU cards
devices: "0" # "0,1"
......
......@@ -3,6 +3,7 @@
worker_num: 1
#http port; rpc_port and http_port must not both be empty. When rpc_port is available and http_port is empty, http_port is not generated automatically
rpc_port: 9998
http_port: 18082
dag:
......@@ -19,8 +20,11 @@ op:
#uci model path
model_config: uci_housing_model
#compute device IDs: when devices is "" or omitted, inference runs on CPU; when it is "0" or "0,1,2", it runs on the listed GPU cards
devices: "0" # "0,1"
#compute device type: when left unset, it is determined by devices (CPU/GPU); 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
#compute device IDs; device_type takes precedence in deciding the device. When devices is "" or omitted, inference runs on CPU; when it is "0" or "0,1,2", it runs on the listed GPU cards
devices: "" # "0,1"
#client type: one of brpc, grpc, local_predictor. local_predictor does not start a Serving service; inference runs in-process
client_type: local_predictor
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging
import numpy as np
from numpy import array
import sys
import base64
_LOGGER = logging.getLogger()
np.set_printoptions(threshold=sys.maxsize)
class UciOp(Op):
def init_op(self):
self.separator = ","
def preprocess(self, input_dicts, data_id, log_id):
"""
Difference from web_server.py:
the Java client input type is INDArray, while the RESTful request input is a list.
This function simply reshapes the input to the specified shape.
"""
(_, input_dict), = input_dicts.items()
_LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
log_id, input_dict))
proc_dict = {}
x_value = input_dict["x"]
input_dict["x"] = x_value.reshape(1,13)
return input_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id):
_LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
log_id, fetch_dict))
fetch_dict["price"] = str(fetch_dict["price"][0][0])
return fetch_dict, None, ""
class UciService(WebService):
def get_pipeline_response(self, read_op):
uci_op = UciOp(name="uci", input_ops=[read_op])
return uci_op
uci_service = UciService(name="uci")
uci_service.prepare_pipeline_config("config.yml")
uci_service.run_service()
......@@ -20,6 +20,7 @@ import google.protobuf.text_format
import numpy as np
import argparse
import paddle.fluid as fluid
import paddle.inference as inference
from .proto import general_model_config_pb2 as m_config
from paddle.fluid.core import PaddleTensor
from paddle.fluid.core import AnalysisConfig
......@@ -57,6 +58,8 @@ class LocalPredictor(object):
mem_optim=True,
ir_optim=False,
use_trt=False,
use_lite=False,
use_xpu=False,
use_feed_fetch_ops=False):
"""
Load model config and set the engine config for the paddle predictor
......@@ -70,6 +73,8 @@ class LocalPredictor(object):
mem_optim: memory optimization, True default.
ir_optim: enable computation graph (IR) optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use the Paddle-Lite engine, False default
use_xpu: run inference on Baidu Kunlun XPU, False default
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
......@@ -80,9 +85,9 @@ class LocalPredictor(object):
config = AnalysisConfig(model_path)
logger.info("load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_feed_fetch_ops:{}".format(
use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_feed_fetch_ops))
ir_optim, use_trt, use_lite, use_xpu, use_feed_fetch_ops))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
......@@ -119,6 +124,16 @@ class LocalPredictor(object):
use_static=False,
use_calib_mode=False)
if use_lite:
config.enable_lite_engine(
precision_mode=inference.PrecisionType.Float32,
zero_copy=True,
passes_filter=[],
ops_filter=[])
if use_xpu:
config.enable_xpu(8 * 1024 * 1024)
self.predictor = create_paddle_predictor(config)
def predict(self, feed=None, fetch=None, batch=False, log_id=0):
......
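A hedged usage sketch of the new switches (the model path and flag combination below are illustrative; in the pipeline these are normally driven by `device_type`/`devices` in config.yml rather than set by hand):
```python
# Hedged sketch: LocalPredictor with the Paddle-Lite / Kunlun XPU switches enabled.
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    model_path="uci_housing_model",  # illustrative model directory
    use_gpu=False,
    use_lite=True,                   # route execution through the Paddle-Lite engine
    use_xpu=True,                    # enable_xpu() with an 8 MB L3 workspace
    ir_optim=True)
# feed/fetch names are read from serving_server_conf.prototxt inside the model dir
```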
......@@ -522,78 +522,48 @@ class MultiLangClient(object):
req.fetch_var_names.extend(fetch)
req.is_python = is_python
req.log_id = log_id
feed_batch = None
if isinstance(feed, dict):
feed_batch = [feed]
elif isinstance(feed, list):
feed_batch = feed
else:
raise Exception("{} not support".format(type(feed)))
req.feed_var_names.extend(feed_batch[0].keys())
init_feed_names = False
for feed_data in feed_batch:
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed_data[name]
v_type = self.feed_types_[name]
if is_python:
data = None
if isinstance(var, list):
if v_type == 0: # int64
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("error tensor value type.")
feed_var_names = []
for key in feed.keys():
if '.lod' not in key:
feed_var_names.append(key)
req.feed_var_names.extend(feed_var_names)
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed[name]
v_type = self.feed_types_[name]
if is_python:
data = None
if isinstance(var, list):
if v_type == 0: # int64
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
else:
if isinstance(var, np.ndarray):
if v_type == 0: # int64
tensor.int64_data.extend(
var.reshape(-1).astype("int64").tolist())
elif v_type == 1:
tensor.float_data.extend(
var.reshape(-1).astype('float32').tolist())
elif v_type == 2:
tensor.int_data.extend(
var.reshape(-1).astype('int32').tolist())
else:
raise Exception("error tensor value type.")
elif isinstance(var, list):
if v_type == 0:
tensor.int64_data.extend(self._flatten_list(var))
elif v_type == 1:
tensor.float_data.extend(self._flatten_list(var))
elif v_type == 2:
tensor.int_data.extend(self._flatten_list(var))
else:
raise Exception("error tensor value type.")
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("var must be list or ndarray.")
if isinstance(var, np.ndarray):
tensor.shape.extend(list(var.shape))
raise Exception("error tensor value type.")
else:
tensor.shape.extend(self.feed_shapes_[name])
inst.tensor_array.append(tensor)
req.insts.append(inst)
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
tensor.shape.extend(list(var.shape))
if "{}.lod".format(name) in feed.keys():
tensor.lod.extend(feed["{}.lod".format(name)])
inst.tensor_array.append(tensor)
req.insts.append(inst)
return req
def _unpack_inference_response(self, resp, fetch, is_python,
......@@ -652,10 +622,17 @@ class MultiLangClient(object):
def predict(self,
feed,
fetch,
batch=True,
need_variant_tag=False,
asyn=False,
is_python=True,
log_id=0):
if isinstance(feed, dict) is False:
raise ValueError("Type Error. grpc feed must be dict.")
if batch is False:
for key in feed:
if ".lod" not in key:
feed[key] = feed[key][np.newaxis, :]
if not asyn:
try:
self.profile_.record('py_prepro_0')
......
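Two conventions in the request-packing code above are easy to miss: when `batch=False` the client inserts the leading batch axis itself, and LoD information rides alongside its tensor under a `<name>.lod` key (those keys are excluded from `feed_var_names`). A hedged illustration, where `words` is a hypothetical LoD feed var used only for the example:
```python
# Hedged illustration of the feed conventions handled by _pack_inference_request.
import numpy as np

x = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
     -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]

# Dense input: shape (13,) becomes (1, 13) inside predict() when batch=False.
dense_feed = {"x": np.array(x, dtype="float32")}

# Variable-length input: the lod list is copied into tensor.lod, not sent as a tensor.
lod_feed = {
    "words": np.array([[8], [233], [52], [601]], dtype="int64"),  # hypothetical feed var
    "words.lod": [0, 4],  # one sequence covering rows 0..4
}

# client.predict(feed=dense_feed, fetch=["price"], batch=False)
# client.predict(feed=lod_feed, fetch=["prediction"], batch=True)  # fetch name is illustrative
```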
......@@ -23,7 +23,90 @@ from paddle.fluid.io import save_inference_model
import paddle.fluid as fluid
from ..proto import general_model_config_pb2 as model_conf
import os
import paddle
import paddle.nn.functional as F
from paddle.jit import to_static
def save_dygraph_model(serving_model_folder, client_config_folder, model):
paddle.jit.save(model, "serving_tmp")
loaded_layer = paddle.jit.load(path=".", model_filename="serving_tmp.pdmodel", params_filename="serving_tmp.pdiparams")
feed_target_names = [x.name for x in loaded_layer._input_spec()]
fetch_target_names = [x.name for x in loaded_layer._output_spec()]
inference_program = loaded_layer.program()
feed_var_dict = {
x: inference_program.global_block().var(x)
for x in feed_target_names
}
fetch_var_dict = {
x: inference_program.global_block().var(x)
for x in fetch_target_names
}
config = model_conf.GeneralModelConfig()
#int64 = 0; float32 = 1; int32 = 2;
for key in feed_var_dict:
feed_var = model_conf.FeedVar()
feed_var.alias_name = key
feed_var.name = feed_var_dict[key].name
feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT64:
feed_var.feed_type = 0
if feed_var_dict[key].dtype == core.VarDesc.VarType.FP32:
feed_var.feed_type = 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32:
feed_var.feed_type = 2
if feed_var.is_lod_tensor:
feed_var.shape.extend([-1])
else:
tmp_shape = []
for v in feed_var_dict[key].shape:
if v >= 0:
tmp_shape.append(v)
feed_var.shape.extend(tmp_shape)
config.feed_var.extend([feed_var])
for key in fetch_var_dict:
fetch_var = model_conf.FetchVar()
fetch_var.alias_name = key
fetch_var.name = fetch_var_dict[key].name
fetch_var.is_lod_tensor = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64:
fetch_var.fetch_type = 0
if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32:
fetch_var.fetch_type = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32:
fetch_var.fetch_type = 2
if fetch_var.is_lod_tensor:
fetch_var.shape.extend([-1])
else:
tmp_shape = []
for v in fetch_var_dict[key].shape:
if v >= 0:
tmp_shape.append(v)
fetch_var.shape.extend(tmp_shape)
config.fetch_var.extend([fetch_var])
cmd = "mkdir -p {}".format(client_config_folder)
os.system(cmd)
cmd = "mkdir -p {}".format(serving_model_folder)
os.system(cmd)
cmd = "mv {} {}/__model__".format("serving_tmp.pdmodel", serving_model_folder)
os.system(cmd)
cmd = "mv {} {}/__params__".format("serving_tmp.pdiparams", serving_model_folder)
os.system(cmd)
cmd = "rm -rf serving_tmp.pd*"
os.system(cmd)
with open("{}/serving_client_conf.prototxt".format(client_config_folder),
"w") as fout:
fout.write(str(config))
with open("{}/serving_server_conf.prototxt".format(serving_model_folder),
"w") as fout:
fout.write(str(config))
with open("{}/serving_client_conf.stream.prototxt".format(
client_config_folder), "wb") as fout:
fout.write(config.SerializeToString())
with open("{}/serving_server_conf.stream.prototxt".format(
serving_model_folder), "wb") as fout:
fout.write(config.SerializeToString())
def save_model(server_model_folder,
client_config_folder,
......@@ -44,6 +127,8 @@ def save_model(server_model_folder,
feed_var_names,
target_vars,
executor,
model_filename="__model__",
params_filename="__params__",
main_program=main_program)
config = model_conf.GeneralModelConfig()
......
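A hedged sketch of how `save_dygraph_model` above might be driven; it assumes the helper is exposed from `paddle_serving_client.io` and that the dygraph layer's `forward` is decorated with `to_static` so `paddle.jit.save` can trace it (the network itself is illustrative):
```python
# Hedged sketch: exporting a dygraph model for serving with save_dygraph_model.
import paddle
from paddle.static import InputSpec
from paddle_serving_client.io import save_dygraph_model  # assumed export location

class UciNet(paddle.nn.Layer):
    def __init__(self):
        super(UciNet, self).__init__()
        self.fc = paddle.nn.Linear(13, 1)

    @paddle.jit.to_static(
        input_spec=[InputSpec(shape=[None, 13], dtype="float32", name="x")])
    def forward(self, x):
        return self.fc(x)

# Writes __model__/__params__ plus the serving_server/serving_client prototxt files.
save_dygraph_model("serving_server_dir", "serving_client_dir", UciNet())
```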
......@@ -230,11 +230,15 @@ class Server(object):
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
if os.path.exists('{}/__params__'.format(model_config_path)):
suffix = ""
else:
suffix = "_DIR"
if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR"
engine.type = "FLUID_CPU_ANALYSIS" + suffix
elif device == "gpu":
engine.type = "FLUID_GPU_ANALYSIS_DIR"
engine.type = "FLUID_GPU_ANALYSIS" + suffix
self.model_toolkit_conf.engines.extend([engine])
......@@ -523,35 +527,26 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0: # int64
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1: # float32
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2: # int32
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
feed_dict = {}
feed_inst = request.insts[0]
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0: # int64
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1: # float32
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2: # int32
data = np.frombuffer(var.data, dtype="int32")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2: # int32
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
if len(var.lod) > 0:
feed_dict["{}.lod".format()] = var.lod
return feed_dict, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
......@@ -608,6 +603,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
ret = self.bclient_.predict(
feed=feed_dict,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
......
......@@ -77,6 +77,10 @@ def serve_args():
help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument(
"--product_name",
type=str,
......@@ -210,6 +214,8 @@ class Server(object):
self.use_local_bin = False
self.gpuid = 0
self.use_trt = False
self.use_lite = False
self.use_xpu = False
self.model_config_paths = None # for multi-model in a workflow
self.product_name = None
self.container_id = None
......@@ -279,6 +285,12 @@ class Server(object):
def set_trt(self):
self.use_trt = True
def set_lite(self):
self.use_lite = True
def set_xpu(self):
self.use_xpu = True
def _prepare_engine(self, model_config_paths, device):
if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf()
......@@ -299,11 +311,17 @@ class Server(object):
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR"
elif device == "gpu":
engine.type = "FLUID_GPU_ANALYSIS_DIR"
elif device == "arm":
engine.type = "FLUID_ARM_ANALYSIS_DIR"
self.model_toolkit_conf.engines.extend([engine])
......@@ -405,10 +423,12 @@ class Server(object):
for line in version_file.readlines():
if re.match("cuda_version", line):
cuda_version = line.split("\"")[1]
if cuda_version != "trt":
device_version = "serving-gpu-cuda" + cuda_version + "-"
else:
if cuda_version == "101" or cuda_version == "102" or cuda_version == "110":
device_version = "serving-gpu-" + cuda_version + "-"
elif cuda_version == "arm":
device_version = "serving-" + cuda_version + "-"
else:
device_version = "serving-gpu-cuda" + cuda_version + "-"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
......@@ -507,36 +527,65 @@ class Server(object):
time.sleep(1)
else:
print("Use local bin : {}".format(self.bin_path))
self.check_cuda()
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-gpuid {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.gpuid,
self.max_body_size)
#self.check_cuda()
if self.use_lite:
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.max_body_size)
else:
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-gpuid {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.gpuid,
self.max_body_size)
print("Going to Run Comand")
print(command)
......
......@@ -38,7 +38,9 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_multilang = args.use_multilang
workdir = "{}_{}".format(args.workdir, gpuid)
workdir = args.workdir
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "":
print("You must specify your serving model")
......@@ -67,6 +69,13 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
if args.use_trt:
server.set_trt()
if args.use_lite:
server.set_lite()
device = "arm"
if args.use_xpu:
server.set_xpu()
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
......@@ -95,7 +104,10 @@ def start_multi_card(args): # pylint: disable=doc-string-missing
exit(-1)
else:
env_gpus = []
if len(gpus) <= 0:
if args.use_lite:
print("run arm server.")
start_gpu_card_model(-1, -1, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, args)
else:
......@@ -128,7 +140,8 @@ if __name__ == "__main__":
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir, port=args.port, device=args.device)
workdir=args.workdir, port=args.port, device=args.device,
use_lite=args.use_lite, use_xpu=args.use_xpu, ir_optim=args.ir_optim)
web_service.run_rpc_service()
app_instance = Flask(__name__)
......
......@@ -83,10 +83,15 @@ class WebService(object):
gpuid=0,
thread_num=2,
mem_optim=True,
use_lite=False,
use_xpu=False,
ir_optim=False):
device = "gpu"
if gpuid == -1:
device = "cpu"
if use_lite:
device = "arm"
else:
device = "cpu"
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
......@@ -103,6 +108,11 @@ class WebService(object):
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
if use_lite:
server.set_lite()
if use_xpu:
server.set_xpu()
server.load_model_config(self.model_config)
if gpuid >= 0:
server.set_gpuid(gpuid)
......@@ -125,9 +135,11 @@ class WebService(object):
workdir="",
port=9393,
device="gpu",
use_lite=False,
use_xpu=False,
ir_optim=False,
gpuid=0,
mem_optim=True,
ir_optim=False):
mem_optim=True):
print("This API will be deprecated later. Please do not use it")
self.workdir = workdir
self.port = port
......@@ -150,6 +162,8 @@ class WebService(object):
-1,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
else:
for i, gpuid in enumerate(self.gpus):
......@@ -160,6 +174,8 @@ class WebService(object):
gpuid,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
def _launch_web_service(self):
......
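A hedged sketch of driving this (deprecated) WebService path with the new switches; the model directory and service name are illustrative, and `load_model_config` is assumed to be the usual WebService helper:
```python
# Hedged sketch: WebService RPC path on Kunlun XPU through Paddle-Lite.
from paddle_serving_server_gpu.web_service import WebService

web_service = WebService(name="uci")
web_service.load_model_config("uci_housing_model")  # illustrative model directory
web_service.prepare_server(
    workdir="workdir",
    port=9393,
    device="cpu",    # no gpu_ids set, so gpuid stays -1
    use_lite=True,   # _launch_rpc_service then builds an "arm" engine
    use_xpu=True,
    ir_optim=True)
web_service.run_rpc_service()
```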
......@@ -38,12 +38,12 @@ class LocalServiceHandler(object):
client_type='local_predictor',
workdir="",
thread_num=2,
device_type=-1,
devices="",
fetch_names=None,
mem_optim=True,
ir_optim=False,
available_port_generator=None,
use_trt=False,
use_profile=False):
"""
Initialization of localservicehandler
......@@ -53,13 +53,14 @@ class LocalServiceHandler(object):
client_type: brpc, grpc and local_predictor[default]
workdir: work directory
thread_num: number of threads, concurrent quantity.
device_type: support multiple devices. -1=Not set, determined by
`devices`. 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
devices: gpu id list[gpu], "" default[cpu]
fetch_names: get fetch names out of LocalServiceHandler in
local_predictor mode. fetch_names_ is compatible for Client().
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use computation graph optimization, False default.
available_port_generator: generate available ports
use_trt: use nvidia tensorRt engine, False default.
use_profile: use profiling, False default.
Returns:
......@@ -70,22 +71,61 @@ class LocalServiceHandler(object):
self._model_config = model_config
self._port_list = []
self._device_type = "cpu"
if devices == "":
# cpu
self._device_name = "cpu"
self._use_gpu = False
self._use_trt = False
self._use_lite = False
self._use_xpu = False
if device_type == -1:
# device_type is not set, determined by `devices`,
if devices == "":
# CPU
self._device_name = "cpu"
devices = [-1]
else:
# GPU
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
elif device_type == 0:
# CPU
self._device_name = "cpu"
devices = [-1]
self._device_type = "cpu"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in cpu device. Port({})"
.format(model_config, self._port_list))
else:
# gpu
self._device_type = "gpu"
elif device_type == 1:
# GPU
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
elif device_type == 2:
# Nvidia Tensor RT
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
self._use_trt = True
elif device_type == 3:
# ARM CPU
self._device_name = "arm"
devices = [-1]
self._use_lite = True
elif device_type == 4:
# Kunlun XPU
self._device_name = "arm"
devices = [int(x) for x in devices.split(",")]
self._use_lite = True
self._use_xpu = True
else:
_LOGGER.error(
"LocalServiceHandler initialization fail. device_type={}"
.format(device_type))
if client_type == "brpc" or client_type == "grpc":
for _ in devices:
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})"
.format(model_config, devices, self._port_list))
_LOGGER.info("Create ports for devices:{}. Port:{}"
.format(devices, self._port_list))
self._client_type = client_type
self._workdir = workdir
self._devices = devices
......@@ -95,12 +135,21 @@ class LocalServiceHandler(object):
self._local_predictor_client = None
self._rpc_service_list = []
self._server_pros = []
self._use_trt = use_trt
self._use_profile = use_profile
self.fetch_names_ = fetch_names
self._fetch_names = fetch_names
_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names))
def get_fetch_list(self):
return self.fetch_names_
return self._fetch_names
def get_port_list(self):
return self._port_list
......@@ -137,18 +186,18 @@ class LocalServiceHandler(object):
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()
use_gpu = False
if self._device_type == "gpu":
use_gpu = True
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=use_gpu,
use_gpu=self._use_gpu,
gpu_id=self._devices[concurrency_idx],
use_profile=self._use_profile,
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim,
use_trt=self._use_trt)
use_trt=self._use_trt,
use_lite=self._use_lite,
use_xpu=self._use_xpu)
return self._local_predictor_client
def get_client_config(self):
......@@ -157,7 +206,7 @@ class LocalServiceHandler(object):
def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
ir_optim):
"""
According to _device_type, generating one CpuServer or GpuServer, and
According to self._device_name, generating one Cpu/Gpu/Arm Server, and
setting the model config and startup params.
Args:
......@@ -171,7 +220,7 @@ class LocalServiceHandler(object):
Returns:
server: CpuServer/GpuServer
"""
if self._device_type == "cpu":
if self._device_name == "cpu":
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......@@ -185,7 +234,7 @@ class LocalServiceHandler(object):
server = Server()
else:
#gpu
#gpu or arm
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......@@ -208,9 +257,9 @@ class LocalServiceHandler(object):
server.load_model_config(self._model_config)
server.prepare_server(
workdir=workdir, port=port, device=self._device_type)
if self.fetch_names_ is None:
self.fetch_names_ = server.get_fetch_list()
workdir=workdir, port=port, device=self._device_name)
if self._fetch_names is None:
self._fetch_names = server.get_fetch_list()
return server
def _start_one_server(self, service_idx):
......@@ -247,7 +296,7 @@ class LocalServiceHandler(object):
"""
Start multiple processes and start one server in each process
"""
for i, service in enumerate(self._rpc_service_list):
for i, _ in enumerate(self._rpc_service_list):
p = multiprocessing.Process(
target=self._start_one_server, args=(i, ))
p.daemon = True
......
......@@ -134,6 +134,7 @@ class Op(object):
self.model_config = None
self.workdir = None
self.thread_num = self.concurrency
self.device_type = -1
self.devices = ""
self.mem_optim = False
self.ir_optim = False
......@@ -153,6 +154,7 @@ class Op(object):
self.client_type = local_service_conf.get("client_type")
self.workdir = local_service_conf.get("workdir")
self.thread_num = local_service_conf.get("thread_num")
self.device_type = local_service_conf.get("device_type")
self.devices = local_service_conf.get("devices")
self.mem_optim = local_service_conf.get("mem_optim")
self.ir_optim = local_service_conf.get("ir_optim")
......@@ -168,6 +170,7 @@ class Op(object):
client_type=self.client_type,
workdir=self.workdir,
thread_num=self.thread_num,
device_type=self.device_type,
devices=self.devices,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
......@@ -188,8 +191,11 @@ class Op(object):
client_type=self.client_type,
workdir=self.workdir,
thread_num=self.thread_num,
device_type=self.device_type,
devices=self.devices,
fetch_names=self._fetch_names)
fetch_names=self._fetch_names,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
......@@ -550,7 +556,8 @@ class Op(object):
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), False, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
p.daemon = True
p.start()
process.append(p)
......@@ -583,7 +590,8 @@ class Op(object):
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), True, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
# When a process exits, it attempts to terminate
# all of its daemonic child processes.
t.daemon = True
......@@ -991,7 +999,7 @@ class Op(object):
def _run(self, concurrency_idx, input_channel, output_channels,
is_thread_op, trace_buffer, model_config, workdir, thread_num,
devices, mem_optim, ir_optim):
device_type, devices, mem_optim, ir_optim):
"""
_run() is the entry function of OP process / thread model.When client
type is local_predictor in process mode, the CUDA environment needs to
......@@ -1009,6 +1017,7 @@ class Op(object):
model_config: model config path
workdir: work directory
thread_num: number of threads, concurrent quantity
device_type: support multiple devices
devices: gpu id list[gpu], "" default[cpu]
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use computation graph optimization, False default.
......@@ -1017,7 +1026,6 @@ class Op(object):
None
"""
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
tid = threading.current_thread().ident
# init ops
profiler = None
......@@ -1028,6 +1036,7 @@ class Op(object):
client_type="local_predictor",
workdir=workdir,
thread_num=thread_num,
device_type=device_type,
devices=devices,
mem_optim=mem_optim,
ir_optim=ir_optim)
......
......@@ -21,6 +21,7 @@ import contextlib
from contextlib import closing
import multiprocessing
import yaml
import io
from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2
from . import operator
......@@ -233,6 +234,7 @@ class PipelineServer(object):
"local_service_conf": {
"workdir": "",
"thread_num": 2,
"device_type": -1,
"devices": "",
"mem_optim": True,
"ir_optim": False,
......@@ -333,7 +335,7 @@ class ServerYamlConfChecker(object):
raise SystemExit("Failed to prepare_server: only one of yml_file"
" or yml_dict can be selected as the parameter.")
if yml_file is not None:
with open(yml_file, encoding='utf-8') as f:
with io.open(yml_file, encoding='utf-8') as f:
conf = yaml.load(f.read())
elif yml_dict is not None:
conf = yml_dict
......@@ -388,6 +390,7 @@ class ServerYamlConfChecker(object):
default_conf = {
"workdir": "",
"thread_num": 2,
"device_type": -1,
"devices": "",
"mem_optim": True,
"ir_optim": False,
......@@ -396,6 +399,7 @@ class ServerYamlConfChecker(object):
"model_config": str,
"workdir": str,
"thread_num": int,
"device_type": int,
"devices": str,
"mem_optim": bool,
"ir_optim": bool,
......
......@@ -41,7 +41,7 @@ if '${PACK}' == 'ON':
copy_lib()
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'sentencepiece<=0.1.92', 'opencv-python<=4.2.0.32', 'pillow',
'six >= 1.10.0', 'sentencepiece', 'opencv-python', 'pillow',
'pyclipper'
]
......
......@@ -174,7 +174,7 @@ function python_test_fit_a_line() {
# test web
unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --name uci --port 9393 --thread 4 --name uci > /dev/null &"
check_cmd "python test_server.py > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
......@@ -183,14 +183,6 @@ function python_test_fit_a_line() {
echo "HTTP status code -ne 200"
exit 1
fi
# test web batch
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
if [ ${http_code} -ne 200 ]; then
echo "HTTP status code -ne 200"
exit 1
fi
setproxy # recover proxy state
kill_server_process
;;
......@@ -202,27 +194,6 @@ function python_test_fit_a_line() {
check_cmd "python test_client.py uci_housing_client/serving_client_conf.prototxt > /dev/null"
kill_server_process
# test web
#unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
#check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
#sleep 5 # wait for the server to start
#check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
#http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
#if [ ${http_code} -ne 200 ]; then
# echo "HTTP status code -ne 200"
# exit 1
#fi
# test web batch
#check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
#http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
#if [ ${http_code} -ne 200 ]; then
# echo "HTTP status code -ne 200"
# exit 1
#fi
#setproxy # recover proxy state
#kill_server_process
;;
*)
echo "error type"
......@@ -589,9 +560,6 @@ function python_test_grpc_impl() {
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -600,9 +568,6 @@ function python_test_grpc_impl() {
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -651,9 +616,7 @@ COMMENT
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
#check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......@@ -662,9 +625,7 @@ COMMENT
sleep 5 # wait for the server to start
check_cmd "python test_sync_client.py > /dev/null"
check_cmd "python test_asyn_client.py > /dev/null"
check_cmd "python test_general_pb_client.py > /dev/null"
check_cmd "python test_numpy_input_client.py > /dev/null"
check_cmd "python test_batch_client.py > /dev/null"
#check_cmd "python test_batch_client.py > /dev/null"
check_cmd "python test_timeout_client.py > /dev/null"
kill_server_process
kill_process_by_port 9393
......