diff --git a/python/examples/bert/README.md b/python/examples/bert/README.md index bd2af745312f4668e8746bcb897bd55642ecff5f..0b9ec5649491165669579044e95def0e766bca1a 100644 --- a/python/examples/bert/README.md +++ b/python/examples/bert/README.md @@ -15,12 +15,18 @@ pip install paddlehub run ``` -python prepare_model.py 20 +python prepare_model.py 128 ``` -the 20 in the command above means max_seq_len in BERT model, which is the length of sample after preprocessing. -the config file and model file for server side are saved in the folder bert_seq20_model. -the config file generated for client side is saved in the folder bert_seq20_client. +the 128 in the command above means max_seq_len in BERT model, which is the length of sample after preprocessing. +the config file and model file for server side are saved in the folder bert_seq128_model. +the config file generated for client side is saved in the folder bert_seq128_client. + +You can also download the above model from BOS(max_seq_len=128). After decompression, the config file and model file for server side are stored in the bert_chinese_L-12_H-768_A-12_model folder, and the config file generated for client side is stored in the bert_chinese_L-12_H-768_A-12_client folder: +```shell +wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz +tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz +``` ### Getting Dict and Sample Dataset @@ -32,11 +38,11 @@ this script will download Chinese Dictionary File vocab.txt and Chinese Sample D ### RPC Inference Service Run ``` -python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9292 #cpu inference service +python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service ``` Or ``` -python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 +python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 ``` ### RPC Inference @@ -47,7 +53,7 @@ pip install paddle_serving_app ``` Run ``` -head data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt +head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt ``` the client reads data from data-c.txt and send prediction request, the prediction is given by word vector. (Due to massive data in the word vector, we do not print it). @@ -58,7 +64,7 @@ the client reads data from data-c.txt and send prediction request, the predictio ``` set environmental variable to specify which gpus are used, the command above means gpu 0 and gpu 1 is used. ``` - python bert_web_service.py bert_seq20_model/ 9292 #launch gpu inference service + python bert_web_service.py bert_seq128_model/ 9292 #launch gpu inference service ``` ### HTTP Inference @@ -75,7 +81,7 @@ GPU:GPU V100 * 1 CUDA/cudnn Version:CUDA 9.2,cudnn 7.1.4 -In the test, 10 thousand samples in the sample data are copied into 100 thousand samples. Each client thread sends a sample of the number of threads. The batch size is 1, the max_seq_len is 20, and the time unit is seconds. +In the test, 10 thousand samples in the sample data are copied into 100 thousand samples. Each client thread sends a sample of the number of threads. The batch size is 1, the max_seq_len is 20(not 128 as described above), and the time unit is seconds. When the number of client threads is 4, the prediction speed can reach 432 samples per second. Because a single GPU can only perform serial calculations internally, increasing the number of client threads can only reduce the idle time of the GPU. Therefore, after the number of threads reaches 4, the increase in the number of threads does not improve the prediction speed. diff --git a/python/examples/bert/README_CN.md b/python/examples/bert/README_CN.md index 305010baf4b39d9682f87ed597776950d6c36aa6..fb74b024113474f2ebc454f5ef341755135fea6b 100644 --- a/python/examples/bert/README_CN.md +++ b/python/examples/bert/README_CN.md @@ -13,11 +13,17 @@ pip install paddlehub ``` 执行 ``` -python prepare_model.py 20 +python prepare_model.py 128 +``` +参数128表示BERT模型中的max_seq_len,即预处理后的样本长度。 +生成server端配置文件与模型文件,存放在bert_seq128_model文件夹。 +生成client端配置文件,存放在bert_seq128_client文件夹。 + +您也可以从bos上直接下载上述模型(max_seq_len=128),解压后server端配置文件与模型文件存放在bert_chinese_L-12_H-768_A-12_model文件夹,client端配置文件存放在bert_chinese_L-12_H-768_A-12_client文件夹: +```shell +wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz +tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz ``` -参数20表示BERT模型中的max_seq_len,即预处理后的样本长度。 -生成server端配置文件与模型文件,存放在bert_seq20_model文件夹 -生成client端配置文件,存放在bert_seq20_client文件夹 ### 获取词典和样例数据 @@ -29,11 +35,11 @@ sh get_data.sh ### 启动RPC预测服务 执行 ``` -python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9292 #启动cpu预测服务 +python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 ``` 或者 ``` -python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 +python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 ``` ### 执行预测 @@ -44,7 +50,7 @@ pip install paddle_serving_app ``` 执行 ``` -head data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt +head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt ``` 启动client读取data-c.txt中的数据进行预测,预测结果为文本的向量表示(由于数据较多,脚本中没有将输出进行打印),server端的地址在脚本中修改。 @@ -54,7 +60,7 @@ head data-c.txt | python bert_client.py --model bert_seq20_client/serving_client ``` 通过环境变量指定gpu预测服务使用的gpu,示例中指定索引为0和1的两块gpu ``` - python bert_web_service.py bert_seq20_model/ 9292 #启动gpu预测服务 + python bert_web_service.py bert_seq128_model/ 9292 #启动gpu预测服务 ``` ### 执行预测 @@ -70,7 +76,7 @@ curl -H "Content-Type:application/json" -X POST -d '{"words": "hello", "fetch":[ 环境:CUDA 9.2,cudnn 7.1.4 -测试中将样例数据中的1W个样本复制为10W个样本,每个client线程发送线程数分之一个样本,batch size为1,max_seq_len为20,时间单位为秒. +测试中将样例数据中的1W个样本复制为10W个样本,每个client线程发送线程数分之一个样本,batch size为1,max_seq_len为20(而不是上面的128),时间单位为秒. 在client线程数为4时,预测速度可以达到432样本每秒。 由于单张GPU内部只能串行计算,client线程增多只能减少GPU的空闲时间,因此在线程数达到4之后,线程数增多对预测速度没有提升。 diff --git a/python/examples/bert/bert_client.py b/python/examples/bert/bert_client.py index 51364c6745731017b31923d246990497115dc780..b33a80d88fcc28200a61bc6125afcea0a0352dab 100644 --- a/python/examples/bert/bert_client.py +++ b/python/examples/bert/bert_client.py @@ -29,7 +29,7 @@ from paddle_serving_app import ChineseBertReader args = benchmark_args() -reader = ChineseBertReader({"max_seq_len": 20}) +reader = ChineseBertReader({"max_seq_len": 128}) fetch = ["pooled_output"] endpoint_list = ["127.0.0.1:9292"] client = Client() diff --git a/python/examples/bert/bert_web_service.py b/python/examples/bert/bert_web_service.py index 04462ca3b16fecf818aadad63b4f67a8d97014fd..e22e379d67e076d4712c8971b6d342b4eaceadb2 100644 --- a/python/examples/bert/bert_web_service.py +++ b/python/examples/bert/bert_web_service.py @@ -21,7 +21,7 @@ import os class BertService(WebService): def load(self): - self.reader = BertReader(vocab_file="vocab.txt", max_seq_len=20) + self.reader = BertReader(vocab_file="vocab.txt", max_seq_len=128) def preprocess(self, feed={}, fetch=[]): feed_res = self.reader.process(feed["words"].encode("utf-8")) diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py index 1fc602cc28e857cf248b7c76ccaf0d602d6afb76..765c368adb42bf11529a2a89f509ce59464d2c90 100644 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -158,8 +158,7 @@ class Client(object): ) else: if self.predictor_sdk_ is None: - timestamp = time.time() - self.add_variant('default_tag_{}'.format(timestamp), endpoints, + self.add_variant('default_tag_{}'.format(id(self)), endpoints, 100) else: print( diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 088e3928f4409eaac4d42d771a72ecc9d13fdbce..b57d2253dbe1f14caff50eb79543f224b8d0ec45 100644 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -19,6 +19,7 @@ Usage: """ import argparse from .web_service import WebService +from flask import Flask, request def parse_args(): # pylint: disable=doc-string-missing @@ -88,3 +89,20 @@ if __name__ == "__main__": service.prepare_server( workdir=args.workdir, port=args.port, device=args.device) service.run_server() + + app_instance = Flask(__name__) + + @app_instance.before_first_request + def init(): + service._launch_web_service() + + service_name = "/" + service.name + "/prediction" + + @app_instance.route(service_name, methods=["POST"]) + def run(): + return service.get_prediction(request) + + app_instance.run(host="0.0.0.0", + port=service.port, + threaded=False, + processes=4) diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py index e94916ccf371022544707e7bb8e03d37045e54b5..c1a86eaecc899c987bd346f8a747fb486d4789ee 100755 --- a/python/paddle_serving_server/web_service.py +++ b/python/paddle_serving_server/web_service.py @@ -50,44 +50,33 @@ class WebService(object): self.device = device def _launch_web_service(self): - app_instance = Flask(__name__) - client_service = Client() - client_service.load_client_config( + self.client_service = Client() + self.client_service.load_client_config( "{}/serving_server_conf.prototxt".format(self.model_config)) - client_service.connect(["0.0.0.0:{}".format(self.port + 1)]) - service_name = "/" + self.name + "/prediction" + self.client_service.connect(["0.0.0.0:{}".format(self.port + 1)]) - @app_instance.route(service_name, methods=['POST']) - def get_prediction(): - if not request.json: - abort(400) - if "fetch" not in request.json: - abort(400) - try: - feed, fetch = self.preprocess(request.json, - request.json["fetch"]) - if isinstance(feed, list): - fetch_map_batch = client_service.predict( - feed_batch=feed, fetch=fetch) - fetch_map_batch = self.postprocess( - feed=request.json, - fetch=fetch, - fetch_map=fetch_map_batch) - result = {"result": fetch_map_batch} - elif isinstance(feed, dict): - if "fetch" in feed: - del feed["fetch"] - fetch_map = client_service.predict(feed=feed, fetch=fetch) - result = self.postprocess( - feed=request.json, fetch=fetch, fetch_map=fetch_map) - except ValueError: - result = {"result": "Request Value Error"} - return result - - app_instance.run(host="0.0.0.0", - port=self.port, - threaded=False, - processes=1) + def get_prediction(self, request): + if not request.json: + abort(400) + if "fetch" not in request.json: + abort(400) + try: + feed, fetch = self.preprocess(request.json, request.json["fetch"]) + if isinstance(feed, list): + fetch_map_batch = self.client_service.predict( + feed_batch=feed, fetch=fetch) + fetch_map_batch = self.postprocess( + feed=request.json, fetch=fetch, fetch_map=fetch_map_batch) + result = {"result": fetch_map_batch} + elif isinstance(feed, dict): + if "fetch" in feed: + del feed["fetch"] + fetch_map = self.client_service.predict(feed=feed, fetch=fetch) + result = self.postprocess( + feed=request.json, fetch=fetch, fetch_map=fetch_map) + except ValueError: + result = {"result": "Request Value Error"} + return result def run_server(self): import socket @@ -96,11 +85,7 @@ class WebService(object): print("http://{}:{}/{}/prediction".format(localIP, self.port, self.name)) p_rpc = Process(target=self._launch_rpc_service) - p_web = Process(target=self._launch_web_service) p_rpc.start() - p_web.start() - p_web.join() - p_rpc.join() def preprocess(self, feed={}, fetch=[]): return feed, fetch diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index cfd5eee9ada15d1702be63af5f9cc09c85a57f0a..d4984c39df866dcca45daa45d9fc15feaaba8635 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -306,6 +306,9 @@ class Server(object): self.check_local_bin() if not self.use_local_bin: self.download_bin() + # wait for other process to download server bin + while not os.path.exists(self.server_path): + time.sleep(1) else: print("Use local bin : {}".format(self.bin_path)) command = "{} " \ @@ -337,8 +340,5 @@ class Server(object): self.gpuid,) print("Going to Run Comand") print(command) - # wait for other process to download server bin - while not os.path.exists(self.server_path): - time.sleep(1) os.system(command) diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index cb82e02cbec83324a6cb6029208325d8ce38e263..916af05ab6c6741b6504ce8f7660f6c7648c50f2 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -21,6 +21,7 @@ import argparse import os from multiprocessing import Pool, Process from paddle_serving_server_gpu import serve_args +from flask import Flask, request def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-missing @@ -114,3 +115,20 @@ if __name__ == "__main__": web_service.prepare_server( workdir=args.workdir, port=args.port, device=args.device) web_service.run_server() + + app_instance = Flask(__name__) + + @app_instance.before_first_request + def init(): + web_service._launch_web_service() + + service_name = "/" + web_service.name + "/prediction" + + @app_instance.route(service_name, methods=["POST"]) + def run(): + return web_service.get_prediction(request) + + app_instance.run(host="0.0.0.0", + port=web_service.port, + threaded=False, + processes=4) diff --git a/python/paddle_serving_server_gpu/web_service.py b/python/paddle_serving_server_gpu/web_service.py index 5d507c9475047d6c7eb65a2b2c5799221cf194b5..1bb8e93b24117c7545245809fab21af53af22dce 100755 --- a/python/paddle_serving_server_gpu/web_service.py +++ b/python/paddle_serving_server_gpu/web_service.py @@ -11,17 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#!flask/bin/python -# pylint: disable=doc-string-missing from flask import Flask, request, abort -from multiprocessing import Pool, Process, Queue from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server import paddle_serving_server_gpu as serving +from multiprocessing import Pool, Process, Queue from paddle_serving_client import Client -from .serve import start_multi_card -import time -import random +from paddle_serving_server_gpu.serve import start_multi_card + +import sys +import numpy as np class WebService(object): @@ -29,7 +28,6 @@ class WebService(object): self.name = name self.gpus = [] self.rpc_service_list = [] - self.input_queues = [] def load_model_config(self, model_config): self.model_config = model_config @@ -66,12 +64,6 @@ class WebService(object): return server def _launch_rpc_service(self, service_idx): - if service_idx == 0: - self.rpc_service_list[service_idx].check_local_bin() - if not self.rpc_service_list[service_idx].use_local_bin: - self.rpc_service_list[service_idx].download_bin() - else: - time.sleep(3) self.rpc_service_list[service_idx].run_server() def prepare_server(self, workdir="", port=9393, device="gpu", gpuid=0): @@ -93,87 +85,30 @@ class WebService(object): gpuid, thread_num=10)) - def producers(self, inputqueue, endpoint): - client = Client() - client.load_client_config("{}/serving_server_conf.prototxt".format( + def _launch_web_service(self): + gpu_num = len(self.gpus) + self.client = Client() + self.client.load_client_config("{}/serving_server_conf.prototxt".format( self.model_config)) - client.connect([endpoint]) - while True: - request_json = inputqueue.get() - try: - feed, fetch = self.preprocess(request_json, - request_json["fetch"]) - if isinstance(feed, list): - fetch_map_batch = client.predict( - feed_batch=feed, fetch=fetch) - fetch_map_batch = self.postprocess( - feed=request_json, - fetch=fetch, - fetch_map=fetch_map_batch) - result = {"result": fetch_map_batch} - elif isinstance(feed, dict): - if "fetch" in feed: - del feed["fetch"] - fetch_map = client.predict(feed=feed, fetch=fetch) - result = self.postprocess( - feed=request_json, fetch=fetch, fetch_map=fetch_map) - self.output_queue.put(result) - except ValueError: - self.output_queue.put(-1) - - def _launch_web_service(self, gpu_num): - app_instance = Flask(__name__) - service_name = "/" + self.name + "/prediction" - - self.input_queues = [] - self.output_queue = Queue() - for i in range(gpu_num): - self.input_queues.append(Queue()) - - producer_list = [] - for i, input_q in enumerate(self.input_queues): - producer_processes = Process( - target=self.producers, - args=( - input_q, - "0.0.0.0:{}".format(self.port + 1 + i), )) - producer_list.append(producer_processes) - - for p in producer_list: - p.start() - - client = Client() - client.load_client_config("{}/serving_server_conf.prototxt".format( - self.model_config)) - client.connect(["0.0.0.0:{}".format(self.port + 1)]) - - self.idx = 0 - - @app_instance.route(service_name, methods=['POST']) - def get_prediction(): - if not request.json: - abort(400) - if "fetch" not in request.json: - abort(400) - - self.input_queues[self.idx].put(request.json) - - #self.input_queues[0].put(request.json) - self.idx += 1 - if self.idx >= len(self.gpus): - self.idx = 0 - result = self.output_queue.get() - if not isinstance(result, dict) and result == -1: - result = {"result": "Request Value Error"} - return result - - app_instance.run(host="0.0.0.0", - port=self.port, - threaded=False, - processes=1) - - for p in producer_list: - p.join() + endpoints = "" + if gpu_num > 0: + for i in range(gpu_num): + endpoints += "127.0.0.1:{},".format(self.port + i + 1) + else: + endpoints = "127.0.0.1:{}".format(self.port + 1) + self.client.connect([endpoints]) + + def get_prediction(self, request): + if not request.json: + abort(400) + if "fetch" not in request.json: + abort(400) + feed, fetch = self.preprocess(request.json, request.json["fetch"]) + fetch_map_batch = self.client.predict(feed=feed, fetch=fetch) + fetch_map_batch = self.postprocess( + feed=request.json, fetch=fetch, fetch_map=fetch_map_batch) + result = {"result": fetch_map_batch} + return result def run_server(self): import socket @@ -188,13 +123,6 @@ class WebService(object): for p in server_pros: p.start() - p_web = Process( - target=self._launch_web_service, args=(len(self.gpus), )) - p_web.start() - p_web.join() - for p in server_pros: - p.join() - def preprocess(self, feed={}, fetch=[]): return feed, fetch diff --git a/tools/Dockerfile b/tools/Dockerfile index 69b9b8bec4be49d6d4b1a5d8eb3fe5550ac1fa15..dc39adf01288f092143803557b322a0c8fbcb2b4 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -3,6 +3,9 @@ FROM centos:7.3.1611 RUN yum -y install wget && \ yum -y install epel-release && yum -y install patchelf && \ yum -y install gcc make python-devel && \ + yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false && \ + yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \ + yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \ yum -y install python3 python3-devel && \ yum clean all && \ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ diff --git a/tools/Dockerfile.ci b/tools/Dockerfile.ci index d53cee6b7389434afa07526682d84e2366ec16f5..8709075f6cf8f985e346999e76f6b273d7664193 100644 --- a/tools/Dockerfile.ci +++ b/tools/Dockerfile.ci @@ -2,6 +2,9 @@ FROM centos:7.3.1611 RUN yum -y install wget >/dev/null \ && yum -y install gcc gcc-c++ make glibc-static which >/dev/null \ && yum -y install git openssl-devel curl-devel bzip2-devel python-devel >/dev/null \ + && yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false \ + && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \ + && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false \ && wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \ && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \ && mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \ diff --git a/tools/serving_build.sh b/tools/serving_build.sh index 6549838a11c8d9c119762c3429a06cae57fe31b6..5934cbca883bd6a40369bd0875b2789ceb8a4d1f 100644 --- a/tools/serving_build.sh +++ b/tools/serving_build.sh @@ -164,6 +164,7 @@ function python_test_fit_a_line() { fi ;; GPU) + export CUDA_VISIBLE_DEVICES=0 # test rpc check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 4 --gpu_ids 0 > /dev/null &" sleep 5 # wait for the server to start @@ -226,7 +227,7 @@ function python_run_criteo_ctr_with_cube() { exit 1 fi echo "criteo_ctr_with_cube inference auc test success" - ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill + kill_server_process ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill ;; GPU) @@ -253,7 +254,7 @@ function python_run_criteo_ctr_with_cube() { exit 1 fi echo "criteo_ctr_with_cube inference auc test success" - ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill + kill_server_process ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill ;; *) @@ -276,27 +277,48 @@ function python_test_bert() { case $TYPE in CPU) pip install paddlehub - python prepare_model.py 20 + # Because download from paddlehub may timeout, + # download the model from bos(max_seq_len=128). + wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz + tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz sh get_data.sh - check_cmd "python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9292 &" + check_cmd "python -m paddle_serving_server.serve --model bert_chinese_L-12_H-768_A-12_model --port 9292 &" sleep 5 pip install paddle_serving_app - check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt" + check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_chinese_L-12_H-768_A-12_client/serving_client_conf.prototxt" kill_server_process - ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill - ps -ef | grep "serving" | grep -v grep | awk '{print $2}' | xargs kill + # python prepare_model.py 20 + # sh get_data.sh + # check_cmd "python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9292 &" + # sleep 5 + # pip install paddle_serving_app + # check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt" + # kill_server_process + # ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill + # ps -ef | grep "serving" | grep -v grep | awk '{print $2}' | xargs kill echo "bert RPC inference pass" ;; GPU) + export CUDA_VISIBLE_DEVICES=0 pip install paddlehub - python prepare_model.py 20 + # Because download from paddlehub may timeout, + # download the model from bos(max_seq_len=128). + wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz + tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz sh get_data.sh - check_cmd "python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9292 --gpu_ids 0 &" + check_cmd "python -m paddle_serving_server_gpu.serve --model bert_chinese_L-12_H-768_A-12_model --port 9292 --gpu_ids 0 &" sleep 5 pip install paddle_serving_app - check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt" + check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_chinese_L-12_H-768_A-12_client/serving_client_conf.prototxt" kill_server_process - ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill + # python prepare_model.py 20 + # sh get_data.sh + # check_cmd "python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9292 --gpu_ids 0 &" + # sleep 5 + # pip install paddle_serving_app + # check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_seq20_client/serving_client_conf.prototxt" + # kill_server_process + # ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill echo "bert RPC inference pass" ;; *) @@ -325,9 +347,10 @@ function python_test_imdb() { check_cmd "python text_classify_service.py imdb_cnn_model/workdir/9292 imdb.vocab &" sleep 5 check_cmd "curl -H "Content-Type:application/json" -X POST -d '{"words": "i am very sad | 0", "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction" + kill_server_process ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill ps -ef | grep "text_classify_service.py" | grep -v grep | awk '{print $2}' | xargs kill - echo "imdb CPU HTTP inference pass" + echo "imdb CPU HTTP inference pass" ;; GPU) echo "imdb ignore GPU test" @@ -356,6 +379,7 @@ function python_test_lac() { check_cmd "python lac_web_service.py jieba_server_model/ lac_workdir 9292 &" sleep 5 check_cmd "curl -H "Content-Type:application/json" -X POST -d '{"words": "我爱北京天安门", "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction" + kill_server_process ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill ps -ef | grep "lac_web_service" | grep -v grep | awk '{print $2}' | xargs kill echo "lac CPU HTTP inference pass" @@ -377,7 +401,7 @@ function python_run_test() { python_test_fit_a_line $TYPE # pwd: /Serving/python/examples python_run_criteo_ctr_with_cube $TYPE # pwd: /Serving/python/examples python_test_bert $TYPE # pwd: /Serving/python/examples - python_test_imdb $TYPE + python_test_imdb $TYPE # pwd: /Serving/python/examples python_test_lac $TYPE echo "test python $TYPE part finished as expected." cd ../.. # pwd: /Serving