From ce9d9238ee2ce52fe81b2f6c60950390ddaaa527 Mon Sep 17 00:00:00 2001
From: root <18561538392@163.com>
Date: Sun, 22 Nov 2020 18:12:56 +0800
Subject: [PATCH] modify faster_rcnn_model benchmark

---
 .../examples/faster_rcnn_model/benchmark.py   | 133 ++++++++++++++++++
 .../examples/faster_rcnn_model/benchmark.sh   |  51 +++++++
 python/examples/faster_rcnn_model/result      | 100 +++++++++++++
 3 files changed, 284 insertions(+)
 create mode 100755 python/examples/faster_rcnn_model/benchmark.py
 create mode 100755 python/examples/faster_rcnn_model/benchmark.sh
 create mode 100755 python/examples/faster_rcnn_model/result

diff --git a/python/examples/faster_rcnn_model/benchmark.py b/python/examples/faster_rcnn_model/benchmark.py
new file mode 100755
index 00000000..19303123
--- /dev/null
+++ b/python/examples/faster_rcnn_model/benchmark.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+"""RPC benchmark client for the faster_rcnn_model serving example."""
+
+from __future__ import unicode_literals, absolute_import
+import os
+import sys
+import time
+import json
+import requests
+from paddle_serving_client import Client
+from paddle_serving_client.utils import MultiThreadRunner
+from paddle_serving_client.utils import benchmark_args, show_latency
+from paddle_serving_app.reader import ChineseBertReader
+
+from paddle_serving_app.reader import *
+import numpy as np
+
+
+args = benchmark_args()
+
+
+def single_func(idx, resource):
+    """Send the configured number of predict requests and time them.
+
+    Args:
+        idx: Worker index; selects an endpoint round-robin.
+        resource: Dict with "endpoint" (list of "host:port" strings) and
+            "turns" (number of requests this worker sends).
+
+    Returns:
+        [[elapsed_s]], or [[elapsed_s], latencies_ms] when the
+        FLAGS_serving_latency environment variable is set.
+
+    Raises:
+        ValueError: If args.request is not "rpc" or args.batch_size < 1.
+    """
+    img = "./000000570688.jpg"
+    profile_flags = bool(os.getenv("FLAGS_profile_client"))
+    latency_flags = bool(os.getenv("FLAGS_serving_latency"))
+    latency_list = []
+
+    if args.request != "rpc":
+        raise ValueError("not implemented {} request".format(args.request))
+    if args.batch_size < 1:
+        raise ValueError("unsupport batch size {}".format(args.batch_size))
+
+    # Read the request count from the shared resource dict instead of relying
+    # on a global that only exists when this file runs as a script.
+    turns = resource["turns"]
+    endpoints = resource["endpoint"]
+
+    preprocess = Sequential([
+        File2Image(), BGR2RGB(), Div(255.0),
+        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False),
+        Resize(640, 640), Transpose((2, 0, 1))
+    ])
+    postprocess = RCNNPostprocess("label_list.txt", "output")
+
+    client = Client()
+    client.load_client_config(args.model)
+    client.connect([endpoints[idx % len(endpoints)]])
+
+    start = time.time()
+    for _ in range(turns):
+        l_start = time.time()
+        b_start = time.time()
+        # Every sample of the batch is the same image: preprocess it once.
+        im = preprocess(img)
+        feed_batch = [{
+            "image": im,
+            "im_info": np.array(list(im.shape[1:]) + [1.0]),
+            "im_shape": np.array(list(im.shape[1:]) + [1.0])
+        } for _sample in range(args.batch_size)]
+        b_end = time.time()
+
+        if profile_flags:
+            # The "bert_pre" tag is kept as is: the committed result file
+            # lists this stage as "bert_pre cost".
+            sys.stderr.write(
+                "PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
+                    os.getpid(),
+                    int(round(b_start * 1000000)),
+                    int(round(b_end * 1000000))))
+        fetch_map = client.predict(
+            feed=feed_batch, fetch=["multiclass_nms"])
+        fetch_map["image"] = img
+        postprocess(fetch_map)
+
+        l_end = time.time()
+        if latency_flags:
+            latency_list.append(l_end * 1000 - l_start * 1000)
+    end = time.time()
+
+    if latency_flags:
+        return [[end - start], latency_list]
+    return [[end - start]]
+
+
+if __name__ == '__main__':
+    multi_thread_runner = MultiThreadRunner()
+    endpoint_list = ["127.0.0.1:7777"]
+    turns = 10
+    start = time.time()
+    result = multi_thread_runner.run(
+        single_func, args.thread,
+        {"endpoint": endpoint_list, "turns": turns})
+    end = time.time()
+    total_cost = end - start
+
+    # result[0] is read once per worker; presumably one elapsed time each.
+    avg_cost = sum(result[0][i] for i in range(args.thread)) / args.thread
+
+    print("total cost: {}s".format(total_cost))
+    print("each thread cost: {}s. ".format(avg_cost))
+    print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
+                                    total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
diff --git a/python/examples/faster_rcnn_model/benchmark.sh b/python/examples/faster_rcnn_model/benchmark.sh
new file mode 100755
index 00000000..5706fd03
--- /dev/null
+++ b/python/examples/faster_rcnn_model/benchmark.sh
@@ -0,0 +1,51 @@
+rm -f profile_log*
+export CUDA_VISIBLE_DEVICES=0
+export FLAGS_profile_server=1
+export FLAGS_profile_client=1
+export FLAGS_serving_latency=1
+
+gpu_id=0
+#save cpu and gpu utilization log
+#always start from an empty directory
+rm -rf utilization
+mkdir utilization
+#start server
+$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model "$1" --port 7777 --thread 4 --gpu_ids 0 --ir_optim > elog 2>&1 &
+sleep 5
+
+#warm up
+$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model "$2/serving_client_conf.prototxt" --request rpc > profile 2>&1
+echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+for thread_num in 1 4 8 16
+do
+for batch_size in 1
+do
+    job_bt=`date '+%Y%m%d%H%M%S'`
+    #keep the pid of each monitor so that exactly these two processes are stopped
+    nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    gpu_memory_pid=$!
+    nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_utilization_pid=$!
+    $PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model "$2/serving_client_conf.prototxt" --request rpc > profile 2>&1
+    kill ${gpu_memory_pid} ${gpu_utilization_pid}
+    echo "model_name:" "$1"
+    echo "thread_num:" $thread_num
+    echo "batch_size:" $batch_size
+    echo "=================Done===================="
+    echo "model_name:$1" >> "profile_log_$1"
+    echo "batch_size:$batch_size" >> "profile_log_$1"
+    $PYTHONROOT/bin/python3 cpu_utilization.py >> "profile_log_$1"
+    job_et=`date '+%Y%m%d%H%M%S'`
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> "profile_log_$1"
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> "profile_log_$1"
+    rm -rf gpu_use.log gpu_utilization.log
+    $PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> "profile_log_$1"
+    tail -n 8 profile >> "profile_log_$1"
+    echo "" >> "profile_log_$1"
+done
+done
+
+#Divided log
+awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' "profile_log_$1"
+mkdir -p bert_log && mv bert_log_* bert_log
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
diff --git a/python/examples/faster_rcnn_model/result b/python/examples/faster_rcnn_model/result
new file mode 100755
index 00000000..573669f7
--- /dev/null
+++ b/python/examples/faster_rcnn_model/result
@@ -0,0 +1,100 @@
+model_name:pddet_serving_model
+batch_size:1
+CPU_UTILIZATION: 0.0
+MAX_GPU_MEMORY: 14525
+GPU_UTILIZATION: 100
+thread_num: 1
+prepro cost: 0.044376s in each thread
+client_infer cost: 4.227083s in each thread
+op0 cost: 0.015847s in each thread
+op1 cost: 3.990032s in each thread
+op2 cost: 9.7e-05s in each thread
+postpro cost: 0.000244s in each thread
+bert_pre cost: 0.304728s in each thread
+py_prepro cost: 0.000431s in each thread
+py_client cost: 4.273316s in each thread
+py_postpro cost: 0.000703s in each thread
+mean: 494.598486328125ms
+median: 480.2005615234375ms
+80 percent: 486.3544921875ms
+90 percent: 508.5200439453124ms
+99 percent: 624.6452905273438ms
+total cost: 5.024378299713135s
+each thread cost: 4.9460344314575195s. 
+qps: 1.990295993550276samples/s
+
+model_name:pddet_serving_model
+batch_size:1
+CPU_UTILIZATION: 0.0
+MAX_GPU_MEMORY: 14525
+GPU_UTILIZATION: 100
+thread_num: 4
+prepro cost: 0.0502565s in each thread
+client_infer cost: 14.9771025s in each thread
+op0 cost: 0.013033s in each thread
+op1 cost: 14.754957s in each thread
+op2 cost: 0.00012475s in each thread
+postpro cost: 0.00036225s in each thread
+bert_pre cost: 0.306132s in each thread
+py_prepro cost: 0.000511s in each thread
+py_client cost: 15.03027975s in each thread
+py_postpro cost: 0.0009275s in each thread
+mean: 1569.41435546875ms
+median: 1614.8760986328125ms
+80 percent: 1799.3856445312506ms
+90 percent: 2011.609326171875ms
+99 percent: 2379.27158203125ms
+total cost: 16.35568356513977s
+each thread cost: 15.694196701049805s. 
+qps: 2.4456330327431455samples/s
+
+model_name:pddet_serving_model
+batch_size:1
+CPU_UTILIZATION: 0.1
+MAX_GPU_MEMORY: 14525
+GPU_UTILIZATION: 100
+thread_num: 8
+prepro cost: 0.0546985s in each thread
+client_infer cost: 31.083384375s in each thread
+op0 cost: 0.0140595s in each thread
+op1 cost: 16.07133675s in each thread
+op2 cost: 0.000132625s in each thread
+postpro cost: 0.000318375s in each thread
+bert_pre cost: 0.31432075s in each thread
+py_prepro cost: 0.00053575s in each thread
+py_client cost: 31.140613125s in each thread
+py_postpro cost: 0.000807375s in each thread
+mean: 3181.2632019042967ms
+median: 3290.6607666015625ms
+80 percent: 3338.09208984375ms
+90 percent: 3686.9481689453123ms
+99 percent: 3735.27556640625ms
+total cost: 33.31558895111084s
+each thread cost: 31.812688767910004s. 
+qps: 2.40127827598655samples/s
+
+model_name:pddet_serving_model
+batch_size:1
+CPU_UTILIZATION: 0.0
+MAX_GPU_MEMORY: 14525
+GPU_UTILIZATION: 100
+thread_num: 16
+prepro cost: 0.0592799375s in each thread
+client_infer cost: 62.949139375s in each thread
+op0 cost: 0.0134921875s in each thread
+op1 cost: 16.5226278125s in each thread
+op2 cost: 0.00015525s in each thread
+postpro cost: 0.0003169375s in each thread
+bert_pre cost: 0.3272226875s in each thread
+py_prepro cost: 0.000590125s in each thread
+py_client cost: 63.0108379375s in each thread
+py_postpro cost: 0.0008313125s in each thread
+mean: 6370.063188171387ms
+median: 6705.1651611328125ms
+80 percent: 7052.77333984375ms
+90 percent: 7165.431909179687ms
+99 percent: 8213.415532226561ms
+total cost: 67.53448605537415s
+each thread cost: 63.70069542527199s. 
+qps: 2.3691599558307113samples/s
+
-- 
GitLab