From 65591ef9e874b5dd20f322f3bf20ea15523c882a Mon Sep 17 00:00:00 2001 From: wangjiawei04 Date: Tue, 16 Mar 2021 16:26:38 +0000 Subject: [PATCH] fix bert and ocr benchmark --- python/examples/pipeline/bert/benchmark.py | 49 +++++++++- python/examples/pipeline/bert/benchmark.sh | 81 ++++++++-------- python/examples/pipeline/ocr/benchmark.py | 101 ++++++++++++++++++++ python/examples/pipeline/ocr/benchmark.sh | 59 ++++++++++++ python/examples/pipeline/ocr/web_service.py | 22 +++-- python/pipeline/profiler.py | 25 ++++- 6 files changed, 282 insertions(+), 55 deletions(-) create mode 100644 python/examples/pipeline/ocr/benchmark.py create mode 100644 python/examples/pipeline/ocr/benchmark.sh diff --git a/python/examples/pipeline/bert/benchmark.py b/python/examples/pipeline/bert/benchmark.py index 47bf7484..3dece491 100644 --- a/python/examples/pipeline/bert/benchmark.py +++ b/python/examples/pipeline/bert/benchmark.py @@ -11,12 +11,50 @@ except ImportError: import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency +''' +2021-03-16 10:26:01,832 ==================== TRACER ====================== +2021-03-16 10:26:01,838 Op(bert): +2021-03-16 10:26:01,838 in[5.7833 ms] +2021-03-16 10:26:01,838 prep[8.2001 ms] +2021-03-16 10:26:01,838 midp[198.79853333333332 ms] +2021-03-16 10:26:01,839 postp[0.8411 ms] +2021-03-16 10:26:01,839 out[0.9440666666666667 ms] +2021-03-16 10:26:01,839 idle[0.03135320683677345] +2021-03-16 10:26:01,839 DAGExecutor: +2021-03-16 10:26:01,839 Query count[30] +2021-03-16 10:26:01,839 QPS[3.0 q/s] +2021-03-16 10:26:01,839 Succ[1.0] +2021-03-16 10:26:01,839 Error req[] +2021-03-16 10:26:01,839 Latency: +2021-03-16 10:26:01,839 ave[237.85519999999997 ms] +2021-03-16 10:26:01,839 .50[179.937 ms] +2021-03-16 10:26:01,839 .60[179.994 ms] +2021-03-16 10:26:01,839 .70[180.515 ms] +2021-03-16 10:26:01,840 .80[180.735 ms] +2021-03-16 10:26:01,840 .90[182.275 ms] +2021-03-16 10:26:01,840 .95[182.789 ms] +2021-03-16 10:26:01,840 .99[1921.33 ms] +2021-03-16 10:26:01,840 Channel (server worker num[1]): +2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0] +2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0] +''' +def parse_benchmark(filein, fileout): + with open(filein, "r") as fin: + res = yaml.load(fin) + del_list = [] + for key in res["DAG"].keys(): + if "call" in key: + del_list.append(key) + for key in del_list: + del res["DAG"][key] + with open(fileout, "w") as fout: + yaml.dump(res, fout, default_flow_style=False) def gen_yml(device): fin = open("config.yml", "r") config = yaml.load(fin) fin.close() - config["dag"]["tracer"] = {"interval_s": 20} + config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": config["op"]["bert"]["local_service_conf"]["device_type"] = 1 config["op"]["bert"]["local_service_conf"]["devices"] = "2" @@ -41,6 +79,8 @@ def run_http(idx, batch_size): data = {"key": keys, "value": values} r = requests.post(url=url, data=json.dumps(data)) start_idx += batch_size + if start_idx > 2000: + break end = time.time() return [[end - start]] @@ -62,6 +102,8 @@ def run_rpc(thread, batch_size): feed[str(i - start_idx)] = lines[i] ret = client.predict(feed_dict=feed, fetch=["res"]) start_idx += batch_size + if start_idx > 1000: + break end = time.time() return [[end - start]] @@ -84,5 +126,8 @@ if __name__ == "__main__": multithread_http(thread, batch_size) elif mode == "rpc": multithread_rpc(thread, batch_size) - + elif sys.argv[1] 
== "dump": + filein = sys.argv[2] + fileout = sys.argv[3] + parse_benchmark(filein, fileout) diff --git a/python/examples/pipeline/bert/benchmark.sh b/python/examples/pipeline/bert/benchmark.sh index 4671ef09..fa5cc8b7 100644 --- a/python/examples/pipeline/bert/benchmark.sh +++ b/python/examples/pipeline/bert/benchmark.sh @@ -1,31 +1,34 @@ +export FLAGS_profile_pipeline=1 alias python3="python3.7" +modelname="bert" # HTTP ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 sleep 3 python3 benchmark.py yaml local_predictor 1 gpu - +rm -rf profile_log_$modelname for thread_num in 1 do -for batch_size in 1 -do -rm -rf PipelineServingLogs -rm -rf cpu_utilization.py -python3 web_service.py >web.log 2>&1 & -sleep 3 -nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & -nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & -echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py -python3 benchmark.py run http $thread_num $batch_size -python3 cpu_utilization.py -echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)" -tail -n 25 PipelineServingLogs/pipeline.tracer -awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1 -awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1 -#rm -rf gpu_use.log gpu_utilization.log -ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 -done + for batch_size in 20 + do + echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname + rm -rf PipelineServingLogs + rm -rf cpu_utilization.py + python3 web_service.py >web.log 2>&1 & + sleep 3 + nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & + nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & + echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py + python3 benchmark.py run http $thread_num $batch_size + python3 cpu_utilization.py >>profile_log_$modelname + ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 + python3 benchmark.py dump benchmark.log benchmark.tmp + mv benchmark.tmp benchmark.log + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname + cat benchmark.log >> profile_log_$modelname + #rm -rf gpu_use.log gpu_utilization.log + done done - # RPC ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 sleep 3 @@ -33,22 +36,24 @@ python3 benchmark.py yaml local_predictor 1 gpu for thread_num in 1 do -for batch_size in 1 -do -rm -rf PipelineServingLogs -rm -rf cpu_utilization.py -python3 web_service.py >web.log 2>&1 & -sleep 3 -nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & -nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & -echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py -python3 benchmark.py run rpc $thread_num 
$batch_size -python3 cpu_utilization.py -echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)" -tail -n 25 PipelineServingLogs/pipeline.tracer -awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1 -awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1 -#rm -rf gpu_use.log gpu_utilization.log -ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 -done + for batch_size in 20 + do + echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname + rm -rf PipelineServingLogs + rm -rf cpu_utilization.py + python3 web_service.py >web.log 2>&1 & + sleep 3 + nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & + nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & + echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py + python3 benchmark.py run rpc $thread_num $batch_size + python3 cpu_utilization.py >>profile_log_$modelname + ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 + python3 benchmark.py dump benchmark.log benchmark.tmp + mv benchmark.tmp benchmark.log + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname + #rm -rf gpu_use.log gpu_utilization.log + cat benchmark.log >> profile_log_$modelname + done done diff --git a/python/examples/pipeline/ocr/benchmark.py b/python/examples/pipeline/ocr/benchmark.py new file mode 100644 index 00000000..af1caae4 --- /dev/null +++ b/python/examples/pipeline/ocr/benchmark.py @@ -0,0 +1,101 @@ +import sys +import os +import base64 +import yaml +import requests +import time +import json +try: + from paddle_serving_server_gpu.pipeline import PipelineClient +except ImportError: + from paddle_serving_server.pipeline import PipelineClient +import numpy as np +from paddle_serving_client.utils import MultiThreadRunner +from paddle_serving_client.utils import benchmark_args, show_latency +def parse_benchmark(filein, fileout): + with open(filein, "r") as fin: + res = yaml.load(fin) + del_list = [] + for key in res["DAG"].keys(): + if "call" in key: + del_list.append(key) + for key in del_list: + del res["DAG"][key] + with open(fileout, "w") as fout: + yaml.dump(res, fout, default_flow_style=False) + +def gen_yml(device): + fin = open("config.yml", "r") + config = yaml.load(fin) + fin.close() + config["dag"]["tracer"] = {"interval_s": 10} + if device == "gpu": + config["op"]["det"]["local_service_conf"]["device_type"] = 1 + config["op"]["det"]["local_service_conf"]["devices"] = "2" + config["op"]["rec"]["local_service_conf"]["device_type"] = 1 + config["op"]["rec"]["local_service_conf"]["devices"] = "2" + with open("config2.yml", "w") as fout: + yaml.dump(config, fout, default_flow_style=False) + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + +def run_http(idx, batch_size): + print("start thread ({})".format(idx)) + url = "http://127.0.0.1:9999/ocr/prediction" + start = time.time() + + test_img_dir = "imgs/" + for img_file in os.listdir(test_img_dir): + with open(os.path.join(test_img_dir, 
img_file), 'rb') as file: + image_data1 = file.read() + image = cv2_to_base64(image_data1) + data = {"key": ["image"], "value": [image]} + for i in range(100): + r = requests.post(url=url, data=json.dumps(data)) + end = time.time() + return [[end - start]] + +def multithread_http(thread, batch_size): + multi_thread_runner = MultiThreadRunner() + result = multi_thread_runner.run(run_http , thread, batch_size) + +def run_rpc(thread, batch_size): + client = PipelineClient() + client.connect(['127.0.0.1:18090']) + start = time.time() + test_img_dir = "imgs/" + for img_file in os.listdir(test_img_dir): + with open(os.path.join(test_img_dir, img_file), 'rb') as file: + image_data = file.read() + image = cv2_to_base64(image_data) + + for i in range(100): + ret = client.predict(feed_dict={"image": image}, fetch=["res"]) + end = time.time() + return [[end - start]] + + +def multithread_rpc(thraed, batch_size): + multi_thread_runner = MultiThreadRunner() + result = multi_thread_runner.run(run_rpc , thread, batch_size) + +if __name__ == "__main__": + if sys.argv[1] == "yaml": + mode = sys.argv[2] # brpc/ local predictor + thread = int(sys.argv[3]) + device = sys.argv[4] + gen_yml(device) + elif sys.argv[1] == "run": + mode = sys.argv[2] # http/ rpc + thread = int(sys.argv[3]) + batch_size = int(sys.argv[4]) + if mode == "http": + multithread_http(thread, batch_size) + elif mode == "rpc": + multithread_rpc(thread, batch_size) + elif sys.argv[1] == "dump": + filein = sys.argv[2] + fileout = sys.argv[3] + parse_benchmark(filein, fileout) + diff --git a/python/examples/pipeline/ocr/benchmark.sh b/python/examples/pipeline/ocr/benchmark.sh new file mode 100644 index 00000000..5887b034 --- /dev/null +++ b/python/examples/pipeline/ocr/benchmark.sh @@ -0,0 +1,59 @@ +export FLAGS_profile_pipeline=1 +alias python3="python3.7" +modelname="ocr" +# HTTP +ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 +sleep 3 +python3 benchmark.py yaml local_predictor 1 gpu +rm -rf profile_log_$modelname +for thread_num in 1 +do + for batch_size in 1 + do + echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname + rm -rf PipelineServingLogs + rm -rf cpu_utilization.py + python3 web_service.py >web.log 2>&1 & + sleep 3 + nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & + nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & + echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py + python3 benchmark.py run http $thread_num $batch_size + python3 cpu_utilization.py >>profile_log_$modelname + ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 + python3 benchmark.py dump benchmark.log benchmark.tmp + mv benchmark.tmp benchmark.log + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname + cat benchmark.log >> profile_log_$modelname + #rm -rf gpu_use.log gpu_utilization.log + done +done +# RPC +ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 +sleep 3 +python3 benchmark.py yaml local_predictor 1 gpu + +for thread_num in 1 +do + for batch_size in 1 + do + echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" 
>>profile_log_$modelname + rm -rf PipelineServingLogs + rm -rf cpu_utilization.py + python3 web_service.py >web.log 2>&1 & + sleep 3 + nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 & + nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 & + echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py + python3 benchmark.py run rpc $thread_num $batch_size + python3 cpu_utilization.py >>profile_log_$modelname + ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 + python3 benchmark.py dump benchmark.log benchmark.tmp + mv benchmark.tmp benchmark.log + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname + awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname + #rm -rf gpu_use.log gpu_utilization.log + cat benchmark.log >> profile_log_$modelname + done +done diff --git a/python/examples/pipeline/ocr/web_service.py b/python/examples/pipeline/ocr/web_service.py index 7e9dd314..e91f0fe5 100644 --- a/python/examples/pipeline/ocr/web_service.py +++ b/python/examples/pipeline/ocr/web_service.py @@ -45,16 +45,19 @@ class DetOp(Op): def preprocess(self, input_dicts, data_id, log_id): (_, input_dict), = input_dicts.items() - data = base64.b64decode(input_dict["image"].encode('utf8')) - data = np.fromstring(data, np.uint8) - # Note: class variables(self.var) can only be used in process op mode - self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) - self.ori_h, self.ori_w, _ = self.im.shape - det_img = self.det_preprocess(self.im) - _, self.new_h, self.new_w = det_img.shape - return {"image": det_img[np.newaxis, :].copy()}, False, None, "" + imgs = [] + for key in input_dict.keys(): + data = base64.b64decode(input_dict[key].encode('utf8')) + data = np.fromstring(data, np.uint8) + self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = self.im.shape + det_img = self.det_preprocess(self.im) + _, self.new_h, self.new_w = det_img.shape + imgs.append(det_img[np.newaxis, :].copy()) + return {"image": np.concatenate(imgs, axis=0)}, False, None, "" def postprocess(self, input_dicts, fetch_dict, log_id): +# print(fetch_dict) det_out = fetch_dict["concat_1.tmp_0"] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w @@ -62,7 +65,6 @@ class DetOp(Op): dt_boxes_list = self.post_func(det_out, [ratio_list]) dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) out_dict = {"dt_boxes": dt_boxes, "image": self.im} - print("out dict", out_dict) return out_dict, None, "" @@ -112,5 +114,5 @@ class OcrService(WebService): uci_service = OcrService(name="ocr") -uci_service.prepare_pipeline_config("config.yml") +uci_service.prepare_pipeline_config("config2.yml") uci_service.run_service() diff --git a/python/pipeline/profiler.py b/python/pipeline/profiler.py index 18845a75..6e38cb31 100644 --- a/python/pipeline/profiler.py +++ b/python/pipeline/profiler.py @@ -26,10 +26,11 @@ from time import time as _time import time import threading import multiprocessing - +import copy _LOGGER = logging.getLogger(__name__) _LOGGER.propagate = False +_is_profile = int(os.environ.get('FLAGS_profile_pipeline', 0)) class PerformanceTracer(object): def __init__(self, is_thread_mode, interval_s, server_worker_num): @@ -48,6 +49,8 @@ 
class PerformanceTracer(object): self._channels = [] # The size of data in Channel will not exceed server_worker_num self._server_worker_num = server_worker_num + if _is_profile: + self.profile_dict = {} def data_buffer(self): return self._data_buffer @@ -82,7 +85,7 @@ class PerformanceTracer(object): item = self._data_buffer.get_nowait() name = item["name"] actions = item["actions"] - + if name == "DAG": succ = item["succ"] req_id = item["id"] @@ -106,9 +109,9 @@ class PerformanceTracer(object): for action, costs in op_cost[name].items(): op_cost[name][action] = sum(costs) / (1e3 * len(costs)) tot_cost += op_cost[name][action] - if name != "DAG": _LOGGER.info("Op({}):".format(name)) + for action in all_actions: if action in op_cost[name]: _LOGGER.info("\t{}[{} ms]".format( @@ -118,7 +121,9 @@ class PerformanceTracer(object): calcu_cost += op_cost[name][action] _LOGGER.info("\tidle[{}]".format(1 - 1.0 * calcu_cost / tot_cost)) - + if _is_profile: + self.profile_dict = copy.deepcopy(op_cost) + if "DAG" in op_cost: calls = list(op_cost["DAG"].values()) calls.sort() @@ -137,7 +142,17 @@ class PerformanceTracer(object): for latency in latencys: _LOGGER.info("\t\t.{}[{} ms]".format(latency, calls[int( tot * latency / 100.0)])) - + if _is_profile: + self.profile_dict["DAG"]["query_count"] = tot + self.profile_dict["DAG"]["qps"] = qps + self.profile_dict["DAG"]["succ"] = 1 - 1.0 * err_count / tot + self.profile_dict["DAG"]["avg"] = ave_cost + for latency in latencys: + self.profile_dict["DAG"][str(latency)] = calls[int(tot * latency / 100.0)] + if _is_profile: + import yaml + with open("benchmark.log", "w") as fout: + yaml.dump(self.profile_dict, fout, default_flow_style=False) # channel _LOGGER.info("Channel (server worker num[{}]):".format( self._server_worker_num)) -- GitLab
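
Note on the dump/parse flow this patch wires together: when FLAGS_profile_pipeline=1, the profiler dumps its profile_dict to a YAML file named benchmark.log, and benchmark.sh then runs "benchmark.py dump benchmark.log benchmark.tmp" to strip the raw per-call entries before appending the summary to profile_log_<modelname>. The following is a minimal standalone sketch of that round trip, not part of the patch: it substitutes yaml.safe_load for the patch's bare yaml.load(fin) (which warns on PyYAML >= 5.1), and the hard-coded file names simply mirror the benchmark.sh invocation.

    # sketch_parse_benchmark.py -- illustrative only, assumptions noted above
    import yaml

    def parse_benchmark(filein, fileout):
        # Load the tracer dump written by profiler.py when FLAGS_profile_pipeline=1.
        with open(filein, "r") as fin:
            res = yaml.safe_load(fin)
        # Drop the raw per-call samples (keys containing "call"), keeping only the
        # aggregated DAG statistics: query_count, qps, succ, avg, and percentiles.
        res["DAG"] = {k: v for k, v in res["DAG"].items() if "call" not in k}
        # Write the filtered summary back out; benchmark.sh later cats this into
        # profile_log_<modelname>.
        with open(fileout, "w") as fout:
            yaml.dump(res, fout, default_flow_style=False)

    if __name__ == "__main__":
        # Same shape of invocation as benchmark.sh: filter via a temp file,
        # which the script then moves back over benchmark.log.
        parse_benchmark("benchmark.log", "benchmark.tmp")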