Commit 2eac9709 authored by MRXLT

fix imagenet benchmark

Parent 689fadca
......@@ -3,7 +3,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim False --ir_optim True 2> elog > stdlog &
python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim 2> elog > stdlog &
hostname=`echo $(hostname)|awk -F '.baidu.com' '{print $1}'`
sleep 5
gpu_id=0
......
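The server launch command drops the explicit values and passes --mem_optim and --ir_optim as bare switches. The snippet below is an illustrative sketch only (not the real paddle_serving_server_gpu.serve parser), assuming these options behave like argparse store_true flags: a bare switch turns the option on, while a trailing "False"/"True" token is not consumed by the flag at all.

    import argparse

    # Toy parser mimicking store_true-style switches; the actual serve
    # module's argument parser is assumed here, not shown in this diff.
    parser = argparse.ArgumentParser()
    parser.add_argument("--mem_optim", action="store_true")
    parser.add_argument("--ir_optim", action="store_true")

    # New style used by this commit: bare switches enable both options.
    print(parser.parse_args(["--mem_optim", "--ir_optim"]))
    # -> both options parse to True

    # Old style: "False"/"True" are left over as unrecognized tokens
    # instead of disabling or enabling anything.
    print(parser.parse_known_args(["--mem_optim", "False", "--ir_optim", "True"]))
    # -> (both options True, ['False', 'True'] left unparsed)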
......@@ -24,7 +24,7 @@ import json
import base64
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from paddle_serving_client.utils import benchmark_args, show_latency
from paddle_serving_app.reader import Sequential, File2Image, Resize
from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize
......@@ -38,7 +38,11 @@ seq_preprocess = Sequential([
def single_func(idx, resource):
file_list = []
turns = 10
turns = resource["turns"]
latency_flags = False
if os.getenv("FLAGS_serving_latency"):
latency_flags = True
latency_list = []
for file_name in os.listdir("./image_data/n01440764"):
file_list.append(file_name)
img_list = []
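latency_flags is driven purely by whether FLAGS_serving_latency is set: os.getenv returns the raw string (or None), so any non-empty value, even "0", switches latency collection on. A standalone check of that behavior, shown only as an illustration and not part of the benchmark code:

    import os

    os.environ["FLAGS_serving_latency"] = "0"
    print(bool(os.getenv("FLAGS_serving_latency")))  # True: "0" is a non-empty string
    del os.environ["FLAGS_serving_latency"]
    print(bool(os.getenv("FLAGS_serving_latency")))  # False: unset reads back as None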
......@@ -56,6 +60,7 @@ def single_func(idx, resource):
start = time.time()
for i in range(turns):
if args.batch_size >= 1:
l_start = time.time()
feed_batch = []
i_start = time.time()
for bi in range(args.batch_size):
......@@ -69,6 +74,9 @@ def single_func(idx, resource):
int(round(i_end * 1000000))))
result = client.predict(feed=feed_batch, fetch=fetch)
l_end = time.time()
if latency_flags:
latency_list.append(l_end * 1000 - l_start * 1000)
else:
print("unsupport batch size {}".format(args.batch_size))
......@@ -88,6 +96,8 @@ def single_func(idx, resource):
r = requests.post(
server, data=req, headers={"Content-Type": "application/json"})
end = time.time()
if latency_flags:
return [[end - start], latency_list]
return [[end - start]]
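With latency collection enabled, single_func returns [[end - start], latency_list]; otherwise only [[end - start]]. The driver below relies on position 0 holding per-thread total cost and position 1 holding the pooled latencies. The helper here is a simplified stand-in for MultiThreadRunner's aggregation, written only to show the assumed result layout.

    def merge_thread_results(per_thread):
        """Gather element 0 (total cost) from each thread and concatenate
        any element-1 latency lists, mimicking the layout __main__ indexes."""
        total_costs = [r[0][0] for r in per_thread]
        latencies = []
        for r in per_thread:
            if len(r) > 1:
                latencies.extend(r[1])
        return [total_costs, latencies]

    merged = merge_thread_results([[[1.9], [12.0, 13.5]], [[2.1], [11.8, 14.2]]])
    print(merged[0])  # [1.9, 2.1]  -> read as result[0][i] per thread
    print(merged[1])  # [12.0, 13.5, 11.8, 14.2] -> what show_latency receives as result[1]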
......@@ -96,11 +106,21 @@ if __name__ == '__main__':
endpoint_list = [
"127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
]
result = multi_thread_runner.run(single_func, args.thread,
{"endpoint": endpoint_list})
turns = 100
start = time.time()
result = multi_thread_runner.run(
single_func, args.thread, {"endpoint": endpoint_list,
"turns": turns})
#result = single_func(0, {"endpoint": endpoint_list})
end = time.time()
total_cost = end - start
avg_cost = 0
for i in range(args.thread):
avg_cost += result[0][i]
avg_cost = avg_cost / args.thread
print("average total cost {} s.".format(avg_cost))
print("total cost: {}s".format(end - start))
print("each thread cost: {}s.".format(avg_cost))
print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
total_cost))
if os.getenv("FLAGS_serving_latency"):
show_latency(result[1])
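The reworked __main__ times the full run, averages result[0] across threads, and derives throughput as batch_size * thread * turns / total_cost before handing result[1] to show_latency. The helper below reproduces that arithmetic and adds a crude percentile summary as a hedged stand-in for show_latency, whose real output format in paddle_serving_client.utils is not shown in this diff.

    def report(batch_size, thread, turns, total_cost, thread_costs, latencies):
        avg_cost = sum(thread_costs) / len(thread_costs)
        qps = batch_size * thread * turns / total_cost
        print("total cost: {}s".format(total_cost))
        print("each thread cost: {}s.".format(avg_cost))
        print("qps: {}samples/s".format(qps))
        if latencies:
            # Rough nearest-rank percentiles; show_latency may report different stats.
            ordered = sorted(latencies)
            for p in (50, 90, 99):
                idx = min(len(ordered) - 1, int(len(ordered) * p / 100))
                print("p{} latency: {:.2f} ms".format(p, ordered[idx]))

    # Made-up numbers: 16 threads x 100 turns x batch 4 finishing in 25 s.
    report(4, 16, 100, 25.0, [24.0] * 16, [12.5, 13.1, 15.8, 40.2])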
......@@ -2,14 +2,14 @@ rm profile_log
export CUDA_VISIBLE_DEVICES=0,1,2,3
export FLAGS_profile_server=1
export FLAGS_profile_client=1
python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim 2> elog > stdlog &
sleep 5
#warm up
$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
for thread_num in 4 8 16
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64
do
......
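run_benchmark.sh now also covers thread_num 1 and starts the server with the new bare --mem_optim --ir_optim switches. Each (thread_num, batch_size) point in the sweep issues thread_num * turns requests and thread_num * turns * batch_size samples; the sketch below just enumerates those workload sizes, assuming turns = 100 as hard-coded in benchmark.py.

    from itertools import product

    turns = 100  # value hard-coded in benchmark.py's __main__
    for thread_num, batch_size in product([1, 4, 8, 16], [1, 4, 16, 64]):
        requests_sent = thread_num * turns
        samples = requests_sent * batch_size
        print("threads={:>2} batch={:>2} -> {:>5} requests, {:>6} samples".format(
            thread_num, batch_size, requests_sent, samples))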
......@@ -8,9 +8,9 @@ hostname=`echo $(hostname)|awk -F '.baidu.com' '{print $1}'`
sleep 5
for thread_num in 4 8 16
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64 256
for batch_size in 1 4 16 64
do
job_bt=`date '+%Y%m%d%H%M%S'`
python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
......
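benchmark_gpu.sh tags each run with a start timestamp via date '+%Y%m%d%H%M%S' (the job_bt line kept above). If the same stamp were ever needed on the Python side, time.strftime accepts the identical format string; this is shown only as an illustration of the format, not as code from the repository.

    import time

    job_bt = time.strftime("%Y%m%d%H%M%S")  # e.g. "20200518093045"
    print(job_bt)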