diff --git a/python/examples/bert/benchmark_batch.py b/python/examples/bert/benchmark_batch.py
index b4d13c7db6b3c32c7e8ccd75c33ce25a196e0ea8..e0f677146a47c0366a1bbafe9eff049e2671a617 100644
--- a/python/examples/bert/benchmark_batch.py
+++ b/python/examples/bert/benchmark_batch.py
@@ -41,13 +41,13 @@ def single_func(idx, resource):
         client = Client()
         client.load_client_config(args.model)
         client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
+        feed_batch = []
+        for bi in range(args.batch_size):
+            feed_batch.append(reader.process(dataset[bi]))

         start = time.time()
         for i in range(1000):
             if args.batch_size >= 1:
-                feed_batch = []
-                for bi in range(args.batch_size):
-                    feed_batch.append(reader.process(dataset[i]))
                 result = client.batch_predict(
                     feed_batch=feed_batch, fetch=fetch)
             else:
@@ -61,7 +61,9 @@ def single_func(idx, resource):

 if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
+    endpoint_list = [
+        "127.0.0.1:9295", "127.0.0.1:9296", "127.0.0.1:9297", "127.0.0.1:9298"
+    ]
     result = multi_thread_runner.run(single_func, args.thread,
                                      {"endpoint": endpoint_list})
     avg_cost = 0
diff --git a/python/examples/bert/benchmark_batch.sh b/python/examples/bert/benchmark_batch.sh
index 46ba451d0ade36c24151e260d5c9b3cc3666a548..272923776d6640880175745920a8fad9e84972fd 100644
--- a/python/examples/bert/benchmark_batch.sh
+++ b/python/examples/bert/benchmark_batch.sh
@@ -1,10 +1,17 @@
 rm profile_log
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+
+sleep 5
+
 for thread_num in 1 2 4 8 16
 do
 for batch_size in 1 2 4 8 16 32 64 128 256 512
 do
     $PYTHONROOT/bin/python benchmark_batch.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
     echo "========================================"
+    echo "thread num: ", $thread_num
+    echo "batch size: ", $batch_size
     echo "batch size : $batch_size" >> profile_log
     $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
     tail -n 1 profile >> profile_log
diff --git a/python/examples/bert/benchmark_with_profile.sh b/python/examples/bert/benchmark_with_profile.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8102e30d5c794d5e21d34e2f4ffd88a1af791b5e
--- /dev/null
+++ b/python/examples/bert/benchmark_with_profile.sh
@@ -0,0 +1,10 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+export FLAGS_profile_client=1
+export FLAGS_profile_server=1
+sleep 5
+thread_num=4
+python benchmark_batch.py --thread ${thread_num} --batch_size 64 --model serving_client_conf/serving_client_conf.prototxt 2> profile
+
+python show_profile.py profile ${thread_num}
+python timeline_trace.py profile trace
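
Note on the benchmark_batch.py hunk: feed_batch is now built once, before start = time.time(), so the 1000-iteration loop times only client.batch_predict and no longer re-runs reader.process on every iteration. It also indexes the dataset with bi (bounded by --batch_size) instead of the loop counter i, which previously ran up to 999 and would index past the end of a shorter dataset. Each client thread still picks its endpoint via resource["endpoint"][idx % len(resource["endpoint"])], so the four ports 9295-9298 are spread across threads round-robin. A minimal sketch of the resulting timing pattern, with hypothetical preprocess() and predict() callables standing in for reader.process() and client.batch_predict():

    import time

    def timed_benchmark(predict, preprocess, samples, batch_size, iters=1000):
        # Build the feed batch a single time, outside the timed region,
        # so preprocessing cost is excluded from the measurement.
        feed_batch = [preprocess(samples[bi]) for bi in range(batch_size)]
        start = time.time()
        for _ in range(iters):
            predict(feed_batch)  # only the prediction call is inside the timer
        return time.time() - start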