diff --git a/python/examples/bert/benchmark.py b/python/examples/bert/benchmark.py
index ff835ab844fe0bdebcf6e0b760a7579037c8d5af..639b717ca7a9d8fcd2a767437dc6a93c26125ecd 100644
--- a/python/examples/bert/benchmark.py
+++ b/python/examples/bert/benchmark.py
@@ -21,7 +21,7 @@ import sys
 import time
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
+from paddle_serving_client.utils import benchmark_args, show_latency
 from batching import pad_batch_data
 import tokenization
 import requests
@@ -35,11 +35,18 @@ def single_func(idx, resource):
     dataset = []
     for line in fin:
         dataset.append(line.strip())
+    profile_flags = False
+    latency_flags = False
     if os.getenv("FLAGS_profile_client"):
         profile_flags = True
+    if os.getenv("FLAGS_serving_latency"):
+        latency_flags = True
+        latency_list = []
+
     if args.request == "rpc":
         reader = BertReader(vocab_file="vocab.txt", max_seq_len=20)
+
         fetch = ["pooled_output"]
         client = Client()
         client.load_client_config(args.model)
@@ -47,11 +54,13 @@ def single_func(idx, resource):
         start = time.time()
         for i in range(turns):
             if args.batch_size >= 1:
+                l_start = time.time()
                 feed_batch = []
                 b_start = time.time()
                 for bi in range(args.batch_size):
                     feed_batch.append(reader.process(dataset[bi]))
                 b_end = time.time()
+
                 if profile_flags:
                     sys.stderr.write(
                         "PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
@@ -59,13 +68,17 @@ def single_func(idx, resource):
                             int(round(b_start * 1000000)),
                             int(round(b_end * 1000000))))
                 result = client.predict(feed=feed_batch, fetch=fetch)
+
+                l_end = time.time()
+                if latency_flags:
+                    latency_list.append(l_end * 1000 - l_start * 1000)
             else:
                 print("unsupport batch size {}".format(args.batch_size))
 
     elif args.request == "http":
         raise ("not implemented")
     end = time.time()
-    return [[end - start]]
+    return [[end - start], latency_list]
 
 
 if __name__ == '__main__':
@@ -78,13 +91,17 @@ if __name__ == '__main__':
     result = multi_thread_runner.run(
         single_func, args.thread, {"endpoint": endpoint_list,
                                    "turns": turns})
+    end = time.time()
+    total_cost = end - start
+
     avg_cost = 0
     for i in range(args.thread):
         avg_cost += result[0][i]
     avg_cost = avg_cost / args.thread
-    end = time.time()
-    total_cost = end - start
+
     print("total cost :{} s".format(total_cost))
     print("each thread cost :{} s. ".format(avg_cost))
     print("qps :{} samples/s".format(args.batch_size * args.thread * turns /
                                      total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
diff --git a/python/examples/bert/benchmark.sh b/python/examples/bert/benchmark.sh
index fef2a73664866316b37603a557f6ad3b34705548..2965b6f3886092a192a42048d65e9be2983a9864 100644
--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
@@ -2,24 +2,28 @@ rm profile_log
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
-python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+export FLAGS_serving_latency=1
+python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
 sleep 5
 
 #warm up
-$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model ./bert_seq20_client/serving_client_conf.prototxt --request rpc > profile 2>&1
+$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
 
-for thread_num in 8 16 32
+for thread_num in 4 8 16
 do
 for batch_size in 1 4 16 64 256
 do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model ./bert_seq20_client/serving_client_conf.prototxt --request rpc > profile 2>&1
+    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    echo "model name :" $1
     echo "thread num :" $thread_num
     echo "batch size :" $batch_size
     echo "=================Done===================="
+    echo "model name :$1" >> profile_log
     echo "batch size :$batch_size" >> profile_log
     $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 3 profile >> profile_log
+    tail -n 8 profile >> profile_log
+    echo "" >> profile_log
 done
 done
diff --git a/python/examples/bert/prepare_model.py b/python/examples/bert/prepare_model.py
index 70902adf9268d1071c79eb27216dcc2ea9a11a49..674947b9c0966b142da93f56e6f9b6ab61098a62 100644
--- a/python/examples/bert/prepare_model.py
+++ b/python/examples/bert/prepare_model.py
@@ -17,10 +17,11 @@ import paddle.fluid as fluid
 import sys
 import paddle_serving_client.io as serving_io
 
-model_name = "bert_chinese_L-12_H-768_A-12"
+#model_name = "bert_chinese_L-12_H-768_A-12"
+model_name = sys.argv[1]
 module = hub.Module(model_name)
 inputs, outputs, program = module.context(
-    trainable=True, max_seq_len=int(sys.argv[1]))
+    trainable=True, max_seq_len=int(sys.argv[2]))
 place = fluid.core_avx.CPUPlace()
 exe = fluid.Executor(place)
 input_ids = inputs["input_ids"]
@@ -37,8 +38,8 @@ feed_var_names = [
 target_vars = [pooled_output, sequence_output]
 
 serving_io.save_model(
-    "bert_seq{}_model".format(sys.argv[1]),
-    "bert_seq{}_client".format(sys.argv[1]), {
+    "{}_seq{}_model".format(sys.argv[1], sys.argv[2]),
+    "{}_seq{}_client".format(sys.argv[1], sys.argv[2]), {
         "input_ids": input_ids,
         "position_ids": position_ids,
         "segment_ids": segment_ids,
diff --git a/python/paddle_serving_client/utils/__init__.py b/python/paddle_serving_client/utils/__init__.py
index 381da6bf9bade2bb0627f4c07851012360905de5..e1d8fe93147cbe891a9a91228ff30a81f248934d 100644
--- a/python/paddle_serving_client/utils/__init__.py
+++ b/python/paddle_serving_client/utils/__init__.py
@@ -17,6 +17,7 @@ import sys
 import subprocess
 import argparse
 from multiprocessing import Pool
+import numpy as np
 
 
 def benchmark_args():
@@ -35,6 +36,17 @@ def benchmark_args():
     return parser.parse_args()
 
 
+def show_latency(latency_list):
+    latency_array = np.array(latency_list)
+    info = ""
+    info += "mean :{} ms\n".format(np.mean(latency_array))
+    info += "median :{} ms\n".format(np.median(latency_array))
+    info += "80 percent :{} ms\n".format(np.percentile(latency_array, 80))
+    info += "90 percent :{} ms\n".format(np.percentile(latency_array, 90))
+    info += "99 percent :{} ms\n".format(np.percentile(latency_array, 99))
+    sys.stderr.write(info)
+
+
 class MultiThreadRunner(object):
     def __init__(self):
         pass
diff --git a/tools/serving_ce.sh b/tools/serving_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ad70f0f60fc8257828eaee2c5d751b36820d3105
--- /dev/null
+++ b/tools/serving_ce.sh
@@ -0,0 +1,25 @@
+set -x
+set -v
+function get_model(){
+if [ ! -d "bert_cased_L-12_H-768_A-12_model" ]; then
+    python -c "from paddle_serving_app.models import ServingModels; models = ServingModels();\
+    models.download(\"$1\")"
+tar -xzf $1.tar.gz
+fi
+}
+
+function bert_demo(){
+cd ../python/examples/bert
+python prepare_model.py bert_chinese_L-12_H-768_A-12 20
+sh benchmark.sh bert_chinese_L-12_H-768_A-12_seq20_model bert_chinese_L-12_H-768_A-12_seq20_client
+
+python prepare_model.py ernie_tiny 20
+sh benchmark.sh ernie_tiny_seq20_model ernie_tiny_seq20_client
+cd -
+}
+
+function imagenet_demo(){
+cd ../python/examples/imagenet
+sh get_model.sh
+
+}
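For reference, a minimal standalone sketch of exercising the new show_latency() helper added by this patch. It assumes the patched paddle_serving_client package is installed; the latency values are synthetic stand-ins for the per-request (l_end - l_start) * 1000 measurements that benchmark.py collects when FLAGS_serving_latency=1, not real benchmark output.

# Minimal sketch: calling the new show_latency() helper directly.
# Assumes the patched paddle_serving_client is installed; the latency
# values below are synthetic stand-ins, not real measurements.
import random

from paddle_serving_client.utils import show_latency

# One entry per request, in milliseconds, mirroring what benchmark.py
# appends to latency_list when FLAGS_serving_latency=1.
latency_list = [random.uniform(5.0, 50.0) for _ in range(1000)]

# Writes mean / median / 80th / 90th / 99th percentile latency to stderr.
show_latency(latency_list)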