diff --git a/python/examples/bert/benchmark.py b/python/examples/bert/benchmark.py
index af75b718b78b2bc130c2411d05d190fc0d298006..f1533d9710d3149a37818d3f1bc146fad6ce6537 100644
--- a/python/examples/bert/benchmark.py
+++ b/python/examples/bert/benchmark.py
@@ -21,11 +21,7 @@ import sys
 import time
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
-from batching import pad_batch_data
-import tokenization
-import requests
-import json
+from paddle_serving_client.utils import benchmark_args, show_latency
 from paddle_serving_app.reader import ChineseBertReader
 
 args = benchmark_args()
@@ -36,42 +32,75 @@ def single_func(idx, resource):
     dataset = []
     for line in fin:
         dataset.append(line.strip())
+
+    profile_flags = False
+    latency_flags = False
+    if os.getenv("FLAGS_profile_client"):
+        profile_flags = True
+    if os.getenv("FLAGS_serving_latency"):
+        latency_flags = True
+        latency_list = []
+
     if args.request == "rpc":
-        reader = ChineseBertReader(vocab_file="vocab.txt", max_seq_len=20)
+        reader = ChineseBertReader({"max_seq_len": 128})
         fetch = ["pooled_output"]
         client = Client()
         client.load_client_config(args.model)
         client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
 
-        start = time.time()
-        for i in range(1000):
-            if args.batch_size == 1:
-                feed_dict = reader.process(dataset[i])
-                result = client.predict(feed=feed_dict, fetch=fetch)
+        for i in range(turns):
+            if args.batch_size >= 1:
+                l_start = time.time()
+                feed_batch = []
+                b_start = time.time()
+                for bi in range(args.batch_size):
+                    feed_batch.append(reader.process(dataset[bi]))
+                b_end = time.time()
+
+                if profile_flags:
+                    sys.stderr.write(
+                        "PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
+                            os.getpid(),
+                            int(round(b_start * 1000000)),
+                            int(round(b_end * 1000000))))
+                result = client.predict(feed=feed_batch, fetch=fetch)
+
+                l_end = time.time()
+                if latency_flags:
+                    latency_list.append(l_end * 1000 - l_start * 1000)
             else:
                 print("unsupport batch size {}".format(args.batch_size))
 
     elif args.request == "http":
-        start = time.time()
-        header = {"Content-Type": "application/json"}
-        for i in range(1000):
-            dict_data = {"words": dataset[i], "fetch": ["pooled_output"]}
-            r = requests.post(
-                'http://{}/bert/prediction'.format(resource["endpoint"][
-                    idx % len(resource["endpoint"])]),
-                data=json.dumps(dict_data),
-                headers=header)
+        raise ("not implemented")
     end = time.time()
-    return [[end - start]]
+    if latency_flags:
+        return [[end - start], latency_list]
+    else:
+        return [[end - start]]
 
 
 if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
-    result = multi_thread_runner.run(single_func, args.thread,
-                                     {"endpoint": endpoint_list})
+    endpoint_list = [
+        "127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
+    ]
+    turns = 10
+    start = time.time()
+    result = multi_thread_runner.run(
+        single_func, args.thread, {"endpoint": endpoint_list,
+                                   "turns": turns})
+    end = time.time()
+    total_cost = end - start
+
     avg_cost = 0
     for i in range(args.thread):
         avg_cost += result[0][i]
     avg_cost = avg_cost / args.thread
-    print("average total cost {} s.".format(avg_cost))
+
+    print("total cost :{} s".format(total_cost))
+    print("each thread cost :{} s. ".format(avg_cost))
+    print("qps :{} samples/s".format(args.batch_size * args.thread * turns /
+                                     total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
diff --git a/python/examples/bert/benchmark.sh b/python/examples/bert/benchmark.sh
index 7f9e2325f3b8f7db288d2b7d82d0d412e05417cb..7ee5f32e9e5d89a836f8962a256bcdf7bf0b62e2 100644
--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
@@ -1,9 +1,30 @@
 rm profile_log
-for thread_num in 1 2 4 8 16
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export FLAGS_profile_server=1
+export FLAGS_profile_client=1
+export FLAGS_serving_latency=1
+python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim False --ir_optim True 2> elog > stdlog &
+
+sleep 5
+
+#warm up
+python3 benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+
+for thread_num in 4 8 16
 do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "========================================"
-    echo "batch size : $batch_size" >> profile_log
-    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 1 profile >> profile_log
+for batch_size in 1 4 16 64 256
+do
+    python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    echo "model name :" $1
+    echo "thread num :" $thread_num
+    echo "batch size :" $batch_size
+    echo "=================Done===================="
+    echo "model name :$1" >> profile_log_$1
+    echo "batch size :$batch_size" >> profile_log_$1
+    python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
+    tail -n 8 profile >> profile_log_$1
+    echo "" >> profile_log_$1
+done
 done
+
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
diff --git a/python/examples/bert/benchmark_batch.py b/python/examples/bert/benchmark_batch.py
deleted file mode 100644
index 7cedb6aa451e0e4a128f0fedbfde1a896977f601..0000000000000000000000000000000000000000
--- a/python/examples/bert/benchmark_batch.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-from __future__ import unicode_literals, absolute_import
-import os
-import sys
-import time
-from paddle_serving_client import Client
-from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
-from batching import pad_batch_data
-import tokenization
-import requests
-import json
-from bert_reader import BertReader
-args = benchmark_args()
-
-
-def single_func(idx, resource):
-    fin = open("data-c.txt")
-    dataset = []
-    for line in fin:
-        dataset.append(line.strip())
-    profile_flags = False
-    if os.environ["FLAGS_profile_client"]:
-        profile_flags = True
-    if args.request == "rpc":
-        reader = BertReader(vocab_file="vocab.txt", max_seq_len=20)
-        fetch = ["pooled_output"]
-        client = Client()
-        client.load_client_config(args.model)
-        client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
-        start = time.time()
-        for i in range(1000):
-            if args.batch_size >= 1:
-                feed_batch = []
-                b_start = time.time()
-                for bi in range(args.batch_size):
-                    feed_batch.append(reader.process(dataset[bi]))
-                b_end = time.time()
-                if profile_flags:
-                    print("PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}".format(
-                        os.getpid(),
-                        int(round(b_start * 1000000)),
-                        int(round(b_end * 1000000))))
-                result = client.predict(feed=feed_batch, fetch=fetch)
-            else:
-                print("unsupport batch size {}".format(args.batch_size))
-
-    elif args.request == "http":
-        raise ("no batch predict for http")
-    end = time.time()
-    return [[end - start]]
-
-
-if __name__ == '__main__':
-    multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
-    result = multi_thread_runner.run(single_func, args.thread,
-                                     {"endpoint": endpoint_list})
-    avg_cost = 0
-    for i in range(args.thread):
-        avg_cost += result[0][i]
-    avg_cost = avg_cost / args.thread
-    print("average total cost {} s.".format(avg_cost))
diff --git a/python/examples/bert/benchmark_batch.sh b/python/examples/bert/benchmark_batch.sh
deleted file mode 100644
index 272923776d6640880175745920a8fad9e84972fd..0000000000000000000000000000000000000000
--- a/python/examples/bert/benchmark_batch.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-rm profile_log
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
-
-sleep 5
-
-for thread_num in 1 2 4 8 16
-do
-for batch_size in 1 2 4 8 16 32 64 128 256 512
-do
-    $PYTHONROOT/bin/python benchmark_batch.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "========================================"
-    echo "thread num: ", $thread_num
-    echo "batch size: ", $batch_size
-    echo "batch size : $batch_size" >> profile_log
-    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 1 profile >> profile_log
-done
-done
diff --git a/python/examples/criteo_ctr_with_cube/cube_prepare.sh b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
index 2d0efaa56f06e9ad8d1590f1316e64bcc65f268d..1417254a54e2194ab3a0194f2ec970f480787acd 100755
--- a/python/examples/criteo_ctr_with_cube/cube_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
@@ -17,6 +17,6 @@ mkdir -p cube_model
 mkdir -p cube/data
 ./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
 
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
+./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
 mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
 cd cube && ./cube
diff --git a/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
index 7c794e103baa3a97d09966c470dd48eb56579500..0db6575ab307fb81cdd0336a20bb9a8ec30d446d 100755
--- a/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
@@ -17,6 +17,6 @@ mkdir -p cube_model
 mkdir -p cube/data
 ./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
 
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
+./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
 mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
 cd cube && ./cube
diff --git a/python/examples/imagenet/benchmark.py b/python/examples/imagenet/benchmark.py
index caa952f121fbd8725c2a6bfe36f0dd84b6a82707..ac7ba8c333d25fb23bfc7695105315bfaa4e76ee 100644
--- a/python/examples/imagenet/benchmark.py
+++ b/python/examples/imagenet/benchmark.py
@@ -93,7 +93,7 @@ def single_func(idx, resource):
 
 if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9696"]
+    endpoint_list = ["127.0.0.1:9393"]
     #endpoint_list = endpoint_list + endpoint_list + endpoint_list
     result = multi_thread_runner.run(single_func, args.thread,
                                      {"endpoint": endpoint_list})
diff --git a/python/examples/imagenet/benchmark.sh b/python/examples/imagenet/benchmark.sh
index 618a62c063c0bc4955baf8516bc5bc93e4832394..84885908fa89d050b3ca71386fe2a21533ce0809 100644
--- a/python/examples/imagenet/benchmark.sh
+++ b/python/examples/imagenet/benchmark.sh
@@ -1,12 +1,28 @@
 rm profile_log
-for thread_num in 1 2 4 8
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export FLAGS_profile_server=1
+export FLAGS_profile_client=1
+python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+
+sleep 5
+
+#warm up
+$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+
+for thread_num in 4 8 16
 do
-for batch_size in 1 2 4 8 16 32 64 128
+for batch_size in 1 4 16 64 256
 do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model ResNet50_vd_client_config/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "========================================"
-    echo "batch size : $batch_size" >> profile_log
+    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    echo "model name :" $1
+    echo "thread num :" $thread_num
+    echo "batch size :" $batch_size
+    echo "=================Done===================="
+    echo "model name :$1" >> profile_log
+    echo "batch size :$batch_size" >> profile_log
     $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 1 profile >> profile_log
+    tail -n 8 profile >> profile_log
 done
 done
+
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
diff --git a/python/examples/util/show_profile.py b/python/examples/util/show_profile.py
index 9153d939338f0ee171af539b9f955d51802ad547..1581dda19bb0abefe6eb21592bda7fc97d8fb7cd 100644
--- a/python/examples/util/show_profile.py
+++ b/python/examples/util/show_profile.py
@@ -31,7 +31,7 @@ with open(profile_file) as f:
         if line[0] == "PROFILE":
             prase(line[2])
 
-print("thread num {}".format(thread_num))
+print("thread num :{}".format(thread_num))
 for name in time_dict:
-    print("{} cost {} s in each thread ".format(name, time_dict[name] / (
+    print("{} cost :{} s in each thread ".format(name, time_dict[name] / (
         1000000.0 * float(thread_num))))
diff --git a/python/paddle_serving_client/utils/__init__.py b/python/paddle_serving_client/utils/__init__.py
index 381da6bf9bade2bb0627f4c07851012360905de5..53f40726fbf21a0607b47bb29a20aa6ff50b6221 100644
--- a/python/paddle_serving_client/utils/__init__.py
+++ b/python/paddle_serving_client/utils/__init__.py
@@ -17,6 +17,7 @@ import sys
 import subprocess
 import argparse
 from multiprocessing import Pool
+import numpy as np
 
 
 def benchmark_args():
@@ -35,6 +36,17 @@ def benchmark_args():
     return parser.parse_args()
 
 
+def show_latency(latency_list):
+    latency_array = np.array(latency_list)
+    info = "latency:\n"
+    info += "mean :{} ms\n".format(np.mean(latency_array))
+    info += "median :{} ms\n".format(np.median(latency_array))
+    info += "80 percent :{} ms\n".format(np.percentile(latency_array, 80))
+    info += "90 percent :{} ms\n".format(np.percentile(latency_array, 90))
+    info += "99 percent :{} ms\n".format(np.percentile(latency_array, 99))
+    sys.stderr.write(info)
+
+
 class MultiThreadRunner(object):
     def __init__(self):
         pass