Merge remote-tracking branch 'upstream/develop' into 0.3.0-cherry

217d5d17 · MRXLT · 07758628 · 4470bb08 · 217d5d17 · 217d5d17
10 changed files
--- a/python/examples/bert/benchmark.py
+++ b/python/examples/bert/benchmark.py
@@ -21,11 +21,7 @@ import sys
 import time
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
+from paddle_serving_client.utils import benchmark_args, show_latency
-from batching import pad_batch_data
-import tokenization
-import requests
-import json
 from paddle_serving_app.reader import ChineseBertReader
 args = benchmark_args()
@@ -36,42 +32,75 @@ def single_func(idx, resource):
    dataset = []
    for line in fin:
        dataset.append(line.strip())
+    profile_flags = False
+    latency_flags = False
+    if os.getenv("FLAGS_profile_client"):
+        profile_flags = True
+    if os.getenv("FLAGS_serving_latency"):
+        latency_flags = True
+        latency_list = []
    if args.request == "rpc":
-        reader = ChineseBertReader(vocab_file="vocab.txt", max_seq_len=20)
+        reader = ChineseBertReader({"max_seq_len": 128})
        fetch = ["pooled_output"]
        client = Client()
        client.load_client_config(args.model)
        client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
        start = time.time()
-        for i in range(1000):
+        for i in range(turns):
-            if args.batch_size == 1:
+            if args.batch_size >= 1:
-                feed_dict = reader.process(dataset[i])
+                l_start = time.time()
-                result = client.predict(feed=feed_dict, fetch=fetch)
+                feed_batch = []
+                b_start = time.time()
+                for bi in range(args.batch_size):
+                    feed_batch.append(reader.process(dataset[bi]))
+                b_end = time.time()
+                if profile_flags:
+                    sys.stderr.write(
+                        "PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
+                            os.getpid(),
+                            int(round(b_start * 1000000)),
+                            int(round(b_end * 1000000))))
+                result = client.predict(feed=feed_batch, fetch=fetch)
+                l_end = time.time()
+                if latency_flags:
+                    latency_list.append(l_end * 1000 - l_start * 1000)
            else:
                print("unsupport batch size {}".format(args.batch_size))
    elif args.request == "http":
-        start = time.time()
+        raise ("not implemented")
-        header = {"Content-Type": "application/json"}
-        for i in range(1000):
-            dict_data = {"words": dataset[i], "fetch": ["pooled_output"]}
-            r = requests.post(
-                'http://{}/bert/prediction'.format(resource["endpoint"][
-                    idx % len(resource["endpoint"])]),
-                data=json.dumps(dict_data),
-                headers=header)
    end = time.time()
-    return [[end - start]]
+    if latency_flags:
+        return [[end - start], latency_list]
+    else:
+        return [[end - start]]
 if __name__ == '__main__':
    multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
+    endpoint_list = [
-    result = multi_thread_runner.run(single_func, args.thread,
+        "127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
-                                     {"endpoint": endpoint_list})
+    ]
+    turns = 10
+    start = time.time()
+    result = multi_thread_runner.run(
+        single_func, args.thread, {"endpoint": endpoint_list,
+                                   "turns": turns})
+    end = time.time()
+    total_cost = end - start
    avg_cost = 0
    for i in range(args.thread):
        avg_cost += result[0][i]
    avg_cost = avg_cost / args.thread
-    print("average total cost {} s.".format(avg_cost))
+    print("total cost :{} s".format(total_cost))
+    print("each thread cost :{} s. ".format(avg_cost))
+    print("qps :{} samples/s".format(args.batch_size * args.thread * turns /
+                                     total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
 rm profile_log
-for thread_num in 1 2 4 8 16
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export FLAGS_profile_server=1
+export FLAGS_profile_client=1
+export FLAGS_serving_latency=1
+python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim False --ir_optim True 2> elog > stdlog &
+sleep 5
+#warm up
+python3 benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+for thread_num in 4 8 16
 do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
+for batch_size in 1 4 16 64 256
-    echo "========================================"
+do
-    echo "batch size : $batch_size" >> profile_log
+    python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
-    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
+    echo "model name :" $1
-    tail -n 1 profile >> profile_log
+    echo "thread num :" $thread_num
+    echo "batch size :" $batch_size
+    echo "=================Done===================="
+    echo "model name :$1" >> profile_log_$1
+    echo "batch size :$batch_size" >> profile_log_$1
+    python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
+    tail -n 8 profile >> profile_log_$1
+    echo "" >> profile_log_$1
+done
 done
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/bert/benchmark_batch.py
+++ b/python/examples/bert/benchmark_batch.py
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-from __future__ import unicode_literals, absolute_import
-import os
-import sys
-import time
-from paddle_serving_client import Client
-from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
-from batching import pad_batch_data
-import tokenization
-import requests
-import json
-from bert_reader import BertReader
-args = benchmark_args()
-def single_func(idx, resource):
-    fin = open("data-c.txt")
-    dataset = []
-    for line in fin:
-        dataset.append(line.strip())
-    profile_flags = False
-    if os.environ["FLAGS_profile_client"]:
-        profile_flags = True
-    if args.request == "rpc":
-        reader = BertReader(vocab_file="vocab.txt", max_seq_len=20)
-        fetch = ["pooled_output"]
-        client = Client()
-        client.load_client_config(args.model)
-        client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
-        start = time.time()
-        for i in range(1000):
-            if args.batch_size >= 1:
-                feed_batch = []
-                b_start = time.time()
-                for bi in range(args.batch_size):
-                    feed_batch.append(reader.process(dataset[bi]))
-                b_end = time.time()
-                if profile_flags:
-                    print("PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}".format(
-                        os.getpid(),
-                        int(round(b_start * 1000000)),
-                        int(round(b_end * 1000000))))
-                result = client.predict(feed=feed_batch, fetch=fetch)
-            else:
-                print("unsupport batch size {}".format(args.batch_size))
-    elif args.request == "http":
-        raise ("no batch predict for http")
-    end = time.time()
-    return [[end - start]]
-if __name__ == '__main__':
-    multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
-    result = multi_thread_runner.run(single_func, args.thread,
-                                     {"endpoint": endpoint_list})
-    avg_cost = 0
-    for i in range(args.thread):
-        avg_cost += result[0][i]
-    avg_cost = avg_cost / args.thread
-    print("average total cost {} s.".format(avg_cost))
--- a/python/examples/bert/benchmark_batch.sh
+++ b/python/examples/bert/benchmark_batch.sh
-rm profile_log
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
-sleep 5
-for thread_num in 1 2 4 8 16
-do
-for batch_size in 1 2 4 8 16 32 64 128 256 512
-do
-    $PYTHONROOT/bin/python benchmark_batch.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "========================================"
-    echo "thread num: ", $thread_num
-    echo "batch size: ", $batch_size
-    echo "batch size : $batch_size" >> profile_log
-    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 1 profile >> profile_log
-done
-done
--- a/python/examples/criteo_ctr_with_cube/cube_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
@@ -17,6 +17,6 @@
 mkdir -p cube_model
 mkdir -p cube/data
 ./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature  
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1  -only_build=false
+./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1  -only_build=false
 mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
 cd cube && ./cube 
--- a/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
@@ -17,6 +17,6 @@
 mkdir -p cube_model
 mkdir -p cube/data
 ./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8  
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1  -only_build=false
+./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1  -only_build=false
 mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
 cd cube && ./cube 
--- a/python/examples/imagenet/benchmark.py
+++ b/python/examples/imagenet/benchmark.py
@@ -93,7 +93,7 @@ def single_func(idx, resource):
 if __name__ == '__main__':
    multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9696"]
+    endpoint_list = ["127.0.0.1:9393"]
    #endpoint_list = endpoint_list + endpoint_list + endpoint_list
    result = multi_thread_runner.run(single_func, args.thread,
                                     {"endpoint": endpoint_list})

--- a/python/examples/imagenet/benchmark.sh
+++ b/python/examples/imagenet/benchmark.sh
 rm profile_log
-for thread_num in 1 2 4 8
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export FLAGS_profile_server=1
+export FLAGS_profile_client=1
+python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+sleep 5
+#warm up
+$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+for thread_num in 4 8 16
 do
-for batch_size in 1 2 4 8 16 32 64 128
+for batch_size in 1 4 16 64 256
 do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model ResNet50_vd_client_config/serving_client_conf.prototxt --request rpc > profile 2>&1
+    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "========================================"
+    echo "model name :" $1
-    echo "batch size : $batch_size" >> profile_log
+    echo "thread num :" $thread_num
+    echo "batch size :" $batch_size
+    echo "=================Done===================="
+    echo "model name :$1" >> profile_log
+    echo "batch size :$batch_size" >> profile_log
    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 1 profile >> profile_log
+    tail -n 8 profile >> profile_log
 done
 done
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/util/show_profile.py
+++ b/python/examples/util/show_profile.py
@@ -31,7 +31,7 @@ with open(profile_file) as f:
        if line[0] == "PROFILE":
            prase(line[2])
-print("thread num {}".format(thread_num))
+print("thread num :{}".format(thread_num))
 for name in time_dict:
-    print("{} cost {} s in each thread ".format(name, time_dict[name] / (
+    print("{} cost :{} s in each thread ".format(name, time_dict[name] / (
        1000000.0 * float(thread_num))))
--- a/python/paddle_serving_client/utils/__init__.py
+++ b/python/paddle_serving_client/utils/__init__.py
@@ -17,6 +17,7 @@ import sys
 import subprocess
 import argparse
 from multiprocessing import Pool
+import numpy as np
 def benchmark_args():
@@ -35,6 +36,17 @@ def benchmark_args():
    return parser.parse_args()
+def show_latency(latency_list):
+    """Write a summary of the samples in ``latency_list`` to stderr.
+
+    The summary contains the mean, the median and the 80th, 90th and 99th
+    percentiles, one per line, under a "latency:" header.
+
+    Args:
+        latency_list: latency samples; converted with ``np.array`` before
+            the statistics are computed. Every value is printed with an
+            "ms" label, so callers are presumably expected to pass
+            milliseconds -- confirm against the call sites.
+    """
+    latency_array = np.array(latency_list)
+    info = "latency:\n"
+    info += "mean :{} ms\n".format(np.mean(latency_array))
+    info += "median :{} ms\n".format(np.median(latency_array))
+    info += "80 percent :{} ms\n".format(np.percentile(latency_array, 80))
+    info += "90 percent :{} ms\n".format(np.percentile(latency_array, 90))
+    info += "99 percent :{} ms\n".format(np.percentile(latency_array, 99))
+    # The whole report is emitted with a single write, on stderr (not stdout).
+    sys.stderr.write(info)
 class MultiThreadRunner(object):
    def __init__(self):
        pass