diff --git a/README.md b/README.md
index efa6765dcc295ed27bb44d6d014a951f31d8cc84..870f2f587f35ba1526c49f58698ae4db17ff0f81 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `port` | int | `9292` | Exposed port of current service to users|
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
-| `mem_optim` | - | - | Enable memory / graphic memory optimization |
+| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
 
diff --git a/README_CN.md b/README_CN.md
index 6ab74ad358e42134bde0aff25548900f6af90c73..6317a79513a3d5e3247d249885d8bfe06de0e1c9 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -120,7 +120,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `port` | int | `9292` | Exposed port of current service to users|
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
-| `mem_optim` | - | - | Enable memory optimization |
+| `mem_optim_off` | - | - | Disable memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
 
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index f5ef70379a5562617e77a9e2ff46587cd48a0f6c..39412f6950b7d4fe71f294079b69707b202f0876 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -40,8 +40,8 @@ ExternalProject_Add(
     extern_brpc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
-    GIT_REPOSITORY "https://github.com/gongweibao/brpc"
-    GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
+    GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
+    GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
     PREFIX ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 3bdd2dfd4739b54bf39b6b3f561c43bab3edabde..eb4f05a28594effcf59aac880cf4d81846a3a925 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -12,4 +12,7 @@
   client.load_client_config(sys.argv[1])
   client.set_rpc_timeout_ms(100000)
   client.connect(["127.0.0.1:9393"])
-  ```
+  ```
+
+- Q: 如何使用自己编译的Paddle Serving进行预测?
+  A: 通过pip命令安装自己编译出的whl包,并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。
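For readers following the new FAQ entry, a minimal sketch of the workflow it describes might look like the following. The wheel filename and binary path are placeholders for whatever your own build produces; only the `SERVING_BIN` environment variable itself comes from the FAQ answer.

```
import os
import subprocess

# Placeholder paths -- substitute the artifacts of your own build.
local_wheel = "build_server/python/dist/paddle_serving_server-0.3.2-py3-none-any.whl"
local_serving_bin = "/path/to/Serving/build_server/core/general-server/serving"

# Install the locally built wheel with pip, as the FAQ answer suggests.
subprocess.check_call(["pip", "install", local_wheel])

# Point the launcher at the locally compiled serving binary before starting
# the service from this same environment.
os.environ["SERVING_BIN"] = local_serving_bin
```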
diff --git a/doc/LATEST_PACKAGES.md b/doc/LATEST_PACKAGES.md
index 98bef9246dea4a8b112699f8ed54d969ea32864d..038641afd38192da5b99f714d278232d3ad79fb4 100644
--- a/doc/LATEST_PACKAGES.md
+++ b/doc/LATEST_PACKAGES.md
@@ -14,11 +14,17 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.2-py2-none-an
 ## GPU server
 ### Python 3
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2-py3-none-any.whl
+#cuda 9.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py3-none-any.whl
+#cuda 10.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py3-none-any.whl
 ```
 ### Python 2
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2-py2-none-any.whl
+#cuda 9.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py2-none-any.whl
+#cuda 10.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py2-none-any.whl
 ```
 
 ## Client
diff --git a/doc/PERFORMANCE_OPTIM.md b/doc/PERFORMANCE_OPTIM.md
index 651be1c139b5960fa287fc3e981f3039f9f098a2..e87e9541cccadf318821807aa63ca4b0e6809a1b 100644
--- a/doc/PERFORMANCE_OPTIM.md
+++ b/doc/PERFORMANCE_OPTIM.md
@@ -14,7 +14,35 @@ Under the same conditions, the communication time of the HTTP prediction service
 
 Parameters for performance optimization:
 
+The memory/graphic memory optimization option is enabled by default in Paddle Serving; it reduces memory/graphic memory usage and usually does not affect performance. If you need to turn it off, use --mem_optim_off on the command line.
+
+ir_optim can optimize the calculation graph and increase inference speed. It is turned off by default and can be turned on with --ir_optim on the command line.
+
 | Parameters | Type | Default | Description |
 | ---------- | ---- | ------- | ------------------------------------------------------------ |
-| mem_optim | - | - | Enable memory / graphic memory optimization |
+| mem_optim_off | - | - | Disable memory / graphic memory optimization |
 | ir_optim | - | - | Enable analysis and optimization of calculation graph,including OP fusion, etc |
+
+
+When the prediction service is started from Python code, the API for the above two parameters is as follows:
+
+RPC Service
+```
+from paddle_serving_server import Server
+server = Server()
+...
+server.set_memory_optimize(mem_optim)
+server.set_ir_optimize(ir_optim)
+...
+```
+
+HTTP Service
+```
+from paddle_serving_server import WebService
+class NewService(WebService):
+...
+new_service = NewService(name="new")
+...
+new_service.prepare_server(mem_optim=True, ir_optim=False)
+...
+```
diff --git a/doc/PERFORMANCE_OPTIM_CN.md b/doc/PERFORMANCE_OPTIM_CN.md
index c35ea7a11c40ad2a5752d9add8fd8d9f8ddb2b64..2fa5bdef1bee1cdc9e9daceaf853403485a06b84 100644
--- a/doc/PERFORMANCE_OPTIM_CN.md
+++ b/doc/PERFORMANCE_OPTIM_CN.md
@@ -14,7 +14,33 @@
 
 性能优化相关参数:
 
+Paddle Serving中默认开启内存/显存优化选项,可以减少对内存/显存的占用,通常不会对性能造成影响,如果需要关闭可以在命令行启动模式中使用--mem_optim_off。
+ir_optim可以优化计算图,提升推理速度,默认关闭,在命令行启动的模式中通过--ir_optim开启。
+
 | 参数 | 类型 | 默认值 | 含义 |
 | --------- | ---- | ------ | -------------------------------- |
-| mem_optim | - | - | 开启内存/显存优化 |
+| mem_optim_off | - | - | 关闭内存/显存优化 |
 | ir_optim | - | - | 开启计算图分析优化,包括OP融合等 |
+
+
+对于使用Python代码启动预测服务的模式,以上两个参数的接口如下:
+RPC服务
+```
+from paddle_serving_server import Server
+server = Server()
+...
+server.set_memory_optimize(mem_optim)
+server.set_ir_optimize(ir_optim)
+...
+```
+
+HTTP服务
+```
+from paddle_serving_server import WebService
+class NewService(WebService):
+...
+new_service = NewService(name="new")
+...
+new_service.prepare_server(mem_optim=True, ir_optim=False)
+...
+```
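The RPC and HTTP snippets added to PERFORMANCE_OPTIM.md above are intentionally elided with `...`. As a reading aid, a fuller sketch of the RPC path is shown below; it assumes the `uci_housing_model` directory from the README example and simply spells out where the `set_memory_optimize` / `set_ir_optimize` calls fit.

```
from paddle_serving_server import OpMaker, OpSeqMaker, Server

# Standard reader -> inference -> response pipeline.
op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_memory_optimize(True)   # mem_optim: enabled by default
server.set_ir_optimize(False)      # ir_optim: opt-in
server.load_model_config("uci_housing_model")  # assumed model directory
server.prepare_server(workdir="workdir", port=9393, device="cpu")
server.run_server()
```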
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 098453a2da2411f5bb83cbdd248898e8879a3922..edec41573b67f50feca52ee017bae2d7fa2b28ac 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -83,6 +83,7 @@ if (SERVER)
         OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
         COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+        COMMAND env ${py_env} ${PYTHON_EXECUTABLE} paddle_serving_server_gpu/gen_cuda_version.py ${CUDA_VERSION_MAJOR}
         COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
         DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
     add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
diff --git a/python/examples/imagenet/resnet50_web_service.py b/python/examples/imagenet/resnet50_web_service.py
index 1dbb2be90da5192baf8b274e9dd3255179842f6d..e7d1914973f2aeb58a912f7d85e35f85718d7a9b 100644
--- a/python/examples/imagenet/resnet50_web_service.py
+++ b/python/examples/imagenet/resnet50_web_service.py
@@ -54,6 +54,7 @@ class ImageService(WebService):
         score_list = fetch_map["score"]
         result = {"label": [], "prob": []}
         for score in score_list:
+            score = score.tolist()
             max_score = max(score)
             result["label"].append(self.label_dict[score.index(max_score)]
                                    .strip().replace(",", ""))
diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py
index 009a6ce00af2290b64716e211429385d09189831..704cf0304adf1ac647c244063c2b23049f92b221 100644
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -40,7 +40,7 @@ def parse_args():  # pylint: disable=doc-string-missing
     parser.add_argument(
         "--device", type=str, default="cpu", help="Type of device")
     parser.add_argument(
-        "--mem_optim",
+        "--mem_optim_off",
         default=False,
         action="store_true",
         help="Memory optimize")
@@ -68,7 +68,7 @@ def start_standard_model():  # pylint: disable=doc-string-missing
     port = args.port
     workdir = args.workdir
     device = args.device
-    mem_optim = args.mem_optim
+    mem_optim = args.mem_optim_off is False
     ir_optim = args.ir_optim
     max_body_size = args.max_body_size
     use_mkl = args.use_mkl
diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py
index d9b9e3f1b1dcfa9502096d0eab4e3be61d2bbaa6..b0c1b79bda5041b4eca114d778a23d3a123c226e 100755
--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -41,6 +41,8 @@ class WebService(object):
         server = Server()
         server.set_op_sequence(op_seq_maker.get_op_sequence())
         server.set_num_threads(16)
+        server.set_memory_optimize(self.mem_optim)
+        server.set_ir_optimize(self.ir_optim)
         server.load_model_config(self.model_config)
         server.prepare_server(
             workdir=self.workdir, port=self.port_list[0], device=self.device)
@@ -55,12 +57,19 @@ class WebService(object):
         else:
             return False
 
-    def prepare_server(self, workdir="", port=9393, device="cpu"):
+    def prepare_server(self,
+                       workdir="",
+                       port=9393,
+                       device="cpu",
+                       mem_optim=True,
+                       ir_optim=False):
         self.workdir = workdir
         self.port = port
         self.device = device
         default_port = 12000
         self.port_list = []
+        self.mem_optim = mem_optim
+        self.ir_optim = ir_optim
         for i in range(1000):
             if self.port_is_available(default_port + i):
                 self.port_list.append(default_port + i)
@@ -83,8 +92,6 @@ class WebService(object):
         if isinstance(feed, dict) and "fetch" in feed:
             del feed["fetch"]
         fetch_map = self.client.predict(feed=feed, fetch=fetch)
-        for key in fetch_map:
-            fetch_map[key] = fetch_map[key].tolist()
         result = self.postprocess(
             feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
         result = {"result": result}
@@ -128,4 +135,6 @@ class WebService(object):
         return feed, fetch
 
     def postprocess(self, feed=[], fetch=[], fetch_map=None):
+        for key in fetch_map:
+            fetch_map[key] = fetch_map[key].tolist()
         return fetch_map
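Since `prepare_server` in python/paddle_serving_server/web_service.py now accepts `mem_optim` and `ir_optim`, a minimal HTTP-service sketch exercising the new keyword arguments could look like this; the model directory and service name are assumptions, and the `run_rpc_service` / `run_web_service` calls follow the pattern of the bundled examples.

```
from paddle_serving_server.web_service import WebService

uci_service = WebService(name="uci")                 # hypothetical service name
uci_service.load_model_config("uci_housing_model")   # assumed model directory
uci_service.prepare_server(
    workdir="workdir",
    port=9292,
    device="cpu",
    mem_optim=True,   # memory optimization stays on by default
    ir_optim=False)   # graph optimization remains opt-in
uci_service.run_rpc_service()
uci_service.run_web_service()
```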
diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py
index 0261003a7863d11fb342d1572b124d1cbb533a2b..1d94bf3093e6d76b260f53acd0c799080627c0ab 100644
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -41,7 +41,7 @@ from concurrent import futures
 def serve_args():
     parser = argparse.ArgumentParser("serve")
     parser.add_argument(
-        "--thread", type=int, default=10, help="Concurrency of server")
+        "--thread", type=int, default=2, help="Concurrency of server")
     parser.add_argument(
         "--model", type=str, default="", help="Model for serving")
     parser.add_argument(
@@ -57,7 +57,7 @@ def serve_args():
     parser.add_argument(
         "--name", type=str, default="None", help="Default service name")
     parser.add_argument(
-        "--mem_optim",
+        "--mem_optim_off",
         default=False,
         action="store_true",
         help="Memory optimize")
@@ -187,7 +187,7 @@ class Server(object):
         self.cube_config_fn = "cube.conf"
         self.workdir = ""
         self.max_concurrency = 0
-        self.num_threads = 4
+        self.num_threads = 2
         self.port = 8080
         self.reload_interval_s = 10
         self.max_body_size = 64 * 1024 * 1024
@@ -363,7 +363,15 @@ class Server(object):
     def download_bin(self):
         os.chdir(self.module_path)
         need_download = False
-        device_version = "serving-gpu-"
+
+        #acquire lock
+        version_file = open("{}/version.py".format(self.module_path), "r")
+        import re
+        for line in version_file.readlines():
+            if re.match("cuda_version", line):
+                cuda_version = line.split("\"")[1]
+        device_version = "serving-gpu-cuda" + cuda_version + "-"
+
         folder_name = device_version + serving_server_version
         tar_name = folder_name + ".tar.gz"
         bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
@@ -372,8 +380,6 @@ class Server(object):
         download_flag = "{}/{}.is_download".format(self.module_path,
                                                    folder_name)
 
-        #acquire lock
-        version_file = open("{}/version.py".format(self.module_path), "r")
         fcntl.flock(version_file, fcntl.LOCK_EX)
 
         if os.path.exists(download_flag):
@@ -385,6 +391,7 @@ class Server(object):
             os.system("touch {}/{}.is_download".format(self.module_path,
                                                        folder_name))
             print('Frist time run, downloading PaddleServing components ...')
+
             r = os.system('wget ' + bin_url + ' --no-check-certificate')
             if r != 0:
                 if os.path.exists(tar_name):
diff --git a/python/paddle_serving_server_gpu/gen_cuda_version.py b/python/paddle_serving_server_gpu/gen_cuda_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a320a0e4dd9f9145a2c7682d5eecb7f582862b5
--- /dev/null
+++ b/python/paddle_serving_server_gpu/gen_cuda_version.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+import os
+
+new_str = ""
+with open("paddle_serving_server_gpu/version.py", "r") as f:
+    for line in f.readlines():
+        if re.match("cuda_version", line):
+            line = re.sub(r"\d+", sys.argv[1], line)
+        new_str = new_str + line
+
+with open("paddle_serving_server_gpu/version.py", "w") as f:
+    f.write(new_str)
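To trace the build-time plumbing above: gen_cuda_version.py rewrites the `cuda_version` line in version.py, and `download_bin` later folds that value into the name of the binary package it fetches. A rough illustration of the resulting values, using CUDA 10 as the example argument:

```
# After the build runs `python gen_cuda_version.py 10`,
# paddle_serving_server_gpu/version.py would read roughly:
serving_client_version = "0.3.2"
serving_server_version = "0.3.2"
module_proto_version = "0.3.2"
cuda_version = "10"

# download_bin() then assembles the download target along these lines:
device_version = "serving-gpu-cuda" + cuda_version + "-"  # "serving-gpu-cuda10-"
folder_name = device_version + serving_server_version     # "serving-gpu-cuda10-0.3.2"
tar_name = folder_name + ".tar.gz"                        # "serving-gpu-cuda10-0.3.2.tar.gz"
bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
```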
diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py
index e26b32c2699d09b714b2658cafad0ae8c5138071..3b0941a97560f11a52808fc7e152419e2cec0ba0 100644
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -34,7 +34,7 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
     port = args.port + index
     thread_num = args.thread
     model = args.model
-    mem_optim = args.mem_optim
+    mem_optim = args.mem_optim_off is False
     ir_optim = args.ir_optim
     max_body_size = args.max_body_size
     use_multilang = args.use_multilang
diff --git a/python/paddle_serving_server_gpu/version.py b/python/paddle_serving_server_gpu/version.py
index f7fc14b2a7f0c25b471e8d3bb44e9d6db6839d01..2272c3aa91f999697ea8ef3e2cdb585b01db8bed 100644
--- a/python/paddle_serving_server_gpu/version.py
+++ b/python/paddle_serving_server_gpu/version.py
@@ -15,3 +15,4 @@
 serving_client_version = "0.3.2"
 serving_server_version = "0.3.2"
 module_proto_version = "0.3.2"
+cuda_version = "9"
diff --git a/python/paddle_serving_server_gpu/web_service.py b/python/paddle_serving_server_gpu/web_service.py
index 2f848b1bf707aba17213d75facba6dea186a2351..5e9fdf4f4fda84dfb7c4f598fae6cf2381c377ca 100644
--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -41,7 +41,9 @@ class WebService(object):
                              workdir="conf",
                              port=9292,
                              gpuid=0,
-                             thread_num=10):
+                             thread_num=2,
+                             mem_optim=True,
+                             ir_optim=False):
         device = "gpu"
         if gpuid == -1:
             device = "cpu"
@@ -58,6 +60,8 @@ class WebService(object):
         server = Server()
         server.set_op_sequence(op_seq_maker.get_op_sequence())
         server.set_num_threads(thread_num)
+        server.set_memory_optimize(mem_optim)
+        server.set_ir_optimize(ir_optim)
         server.load_model_config(self.model_config)
 
         if gpuid >= 0:
@@ -77,7 +81,13 @@ class WebService(object):
         else:
             return False
 
-    def prepare_server(self, workdir="", port=9393, device="gpu", gpuid=0):
+    def prepare_server(self,
+                       workdir="",
+                       port=9393,
+                       device="gpu",
+                       gpuid=0,
+                       mem_optim=True,
+                       ir_optim=False):
         self.workdir = workdir
         self.port = port
         self.device = device
@@ -94,7 +104,12 @@ class WebService(object):
             # init cpu service
             self.rpc_service_list.append(
                 self.default_rpc_service(
-                    self.workdir, self.port_list[0], -1, thread_num=10))
+                    self.workdir,
+                    self.port_list[0],
+                    -1,
+                    thread_num=2,
+                    mem_optim=mem_optim,
+                    ir_optim=ir_optim))
         else:
             for i, gpuid in enumerate(self.gpus):
                 self.rpc_service_list.append(
@@ -102,7 +117,9 @@
                         "{}_{}".format(self.workdir, i),
                         self.port_list[i],
                         gpuid,
-                        thread_num=10))
+                        thread_num=2,
+                        mem_optim=mem_optim,
+                        ir_optim=ir_optim))
 
     def _launch_web_service(self):
         gpu_num = len(self.gpus)
diff --git a/python/setup.py.server_gpu.in b/python/setup.py.server_gpu.in
index 65dec4621fceba3967ff21814b218c0229a5124b..4554c1d368f70a32d16ceeabb54d63625f9f256d 100644
--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
@@ -41,7 +41,6 @@ REQUIRED_PACKAGES = [
     'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
 ]
 
-
 packages=['paddle_serving_server_gpu',
           'paddle_serving_server_gpu.proto',
           'paddle_serving_server_gpu.pipeline',
@@ -58,7 +57,7 @@ package_dir={'paddle_serving_server_gpu':
 
 setup(
     name='paddle-serving-server-gpu',
-    version=serving_server_version.replace('-', ''),
+    version=serving_server_version.replace('-', '') + '.post@CUDA_VERSION_MAJOR@',
     description=
     ('Paddle Serving Package for saved model with PaddlePaddle'),
     url='https://github.com/PaddlePaddle/Serving',
diff --git a/tools/Dockerfile.cuda10.0-cudnn7.devel b/tools/Dockerfile.cuda10.0-cudnn7.devel
index 8021ef31f05622cec6fb3aff681feb5107d2be2c..b46f9b96cf0d081cf9cdfc12cb46be037677ac86 100644
--- a/tools/Dockerfile.cuda10.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda10.0-cudnn7.devel
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.0-cudnn7-runtime-centos7
+FROM nvidia/cuda:10.0-cudnn7-devel-centos7
 
 RUN yum -y install wget >/dev/null \
     && yum -y install gcc gcc-c++ make glibc-static which \