Commit ce9d9238 authored by root

modify faster_rcnn_model benchmark

Parent c8c7f4d0
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import unicode_literals, absolute_import
import os
import sys
import time

import numpy as np

from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
from paddle_serving_app.reader import *

args = benchmark_args()
def single_func(idx, resource):
    img = "./000000570688.jpg"
    profile_flags = False
    latency_flags = False
    if os.getenv("FLAGS_profile_client"):
        profile_flags = True
    if os.getenv("FLAGS_serving_latency"):
        latency_flags = True
        latency_list = []
    turns = resource["turns"]
    if args.request == "rpc":
        # File -> RGB float -> normalized -> 640x640 -> CHW tensor.
        preprocess = Sequential([
            File2Image(), BGR2RGB(), Div(255.0),
            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False),
            Resize(640, 640), Transpose((2, 0, 1))
        ])
        postprocess = RCNNPostprocess("label_list.txt", "output")
        client = Client()
        client.load_client_config(args.model)
        client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
        start = time.time()
        for i in range(turns):
            if args.batch_size >= 1:
                l_start = time.time()
                feed_batch = []
                b_start = time.time()
                # Preprocess once and reuse the tensor for every batch slot.
                im = preprocess(img)
                for bi in range(args.batch_size):
                    # im is CHW, so im.shape[1:] is (height, width);
                    # the trailing 1.0 is the scale factor.
                    feed_batch.append({
                        "image": im,
                        "im_info": np.array(list(im.shape[1:]) + [1.0]),
                        "im_shape": np.array(list(im.shape[1:]) + [1.0])
                    })
                b_end = time.time()
                if profile_flags:
                    # Tag names are kept from the bert benchmark so that
                    # show_profile.py aggregates them unchanged.
                    sys.stderr.write(
                        "PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
                            os.getpid(),
                            int(round(b_start * 1000000)),
                            int(round(b_end * 1000000))))
                fetch_map = client.predict(
                    feed=feed_batch, fetch=["multiclass_nms"])
                # RCNNPostprocess reads the source image path from the map.
                fetch_map["image"] = img
                postprocess(fetch_map)
                l_end = time.time()
                if latency_flags:
                    latency_list.append(l_end * 1000 - l_start * 1000)
            else:
                print("unsupported batch size {}".format(args.batch_size))
    else:
        raise ValueError("not implemented {} request".format(args.request))
    end = time.time()
    if latency_flags:
        return [[end - start], latency_list]
    else:
        return [[end - start]]
if __name__ == '__main__':
    multi_thread_runner = MultiThreadRunner()
    endpoint_list = ["127.0.0.1:7777"]
    turns = 10
    start = time.time()
    result = multi_thread_runner.run(
        single_func, args.thread, {"endpoint": endpoint_list,
                                   "turns": turns})
    end = time.time()
    total_cost = end - start

    avg_cost = 0
    for i in range(args.thread):
        avg_cost += result[0][i]
    avg_cost = avg_cost / args.thread

    print("total cost: {}s".format(total_cost))
    print("each thread cost: {}s. ".format(avg_cost))
    print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
                                    total_cost))
    if os.getenv("FLAGS_serving_latency"):
        show_latency(result[1])
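
For reference, a minimal standalone sketch of what the Sequential pipeline above computes, written with plain cv2/numpy as assumed equivalents of the paddle_serving_app.reader operators (the exact semantics of Normalize's third argument are not verified here; this is not the library's implementation):

import cv2
import numpy as np

def preprocess_sketch(path):
    im = cv2.imread(path)                        # File2Image: HWC, BGR, uint8
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)     # BGR2RGB
    im = im.astype("float32") / 255.0            # Div(255.0)
    mean = np.array([0.485, 0.456, 0.406], dtype="float32")
    std = np.array([0.229, 0.224, 0.225], dtype="float32")
    im = (im - mean) / std                       # Normalize(mean, std)
    im = cv2.resize(im, (640, 640))              # Resize(640, 640)
    return im.transpose((2, 0, 1))               # Transpose: CHW, (3, 640, 640)

# With this layout im.shape[1:] is (640, 640), which is why the feed above
# sends im_info = im_shape = [640.0, 640.0, 1.0] for every batch slot.
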
rm profile_log*
export CUDA_VISIBLE_DEVICES=0
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
gpu_id=0

# Save CPU and GPU utilization logs.
if [ -d utilization ]; then
    rm -rf utilization
else
    mkdir utilization
fi

# Start the server ($1 is the server-side model directory).
$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model $1 --port 7777 --thread 4 --gpu_ids 0 --ir_optim > elog 2>&1 &
sleep 5

# Warm up ($2 is the client-side config directory).
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1

echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py

for thread_num in 1 4 8 16
do
    for batch_size in 1
    do
        job_bt=`date '+%Y%m%d%H%M%S'`
        # Sample GPU memory and utilization every 100 ms in the background.
        nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
        nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
        gpu_memory_pid=$!   # PID of the last background sampler
        $PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
        kill ${gpu_memory_pid}
        kill `ps -ef|grep used_memory|awk '{print $2}'`
        echo "model_name:" $1
        echo "thread_num:" $thread_num
        echo "batch_size:" $batch_size
        echo "=================Done===================="
        echo "model_name:$1" >> profile_log_$1
        echo "batch_size:$batch_size" >> profile_log_$1
        $PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
        job_et=`date '+%Y%m%d%H%M%S'`
        awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
        awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
        rm -rf gpu_use.log gpu_utilization.log
        $PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
        tail -n 8 profile >> profile_log_$1
        echo "" >> profile_log_$1
    done
done

# Split the accumulated profile log into one file per run
# (the bert_log_* naming is kept from the bert benchmark).
awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' profile_log_$1
mkdir bert_log && mv bert_log_* bert_log

# Stop the serving processes.
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
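
Judging from the positional parameters, the script is invoked as `bash benchmark.sh <server_model_dir> <client_conf_dir>` with the GPU visible as device 0. The raw dump that follows appears to be a captured profile_log for pddet_serving_model at thread counts 1, 4, 8, and 16:
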
model_name:pddet_serving_model
batch_size:1
CPU_UTILIZATION: 0.0
MAX_GPU_MEMORY: 14525
GPU_UTILIZATION: 100
thread_num: 1
prepro cost: 0.044376s in each thread
client_infer cost: 4.227083s in each thread
op0 cost: 0.015847s in each thread
op1 cost: 3.990032s in each thread
op2 cost: 9.7e-05s in each thread
postpro cost: 0.000244s in each thread
bert_pre cost: 0.304728s in each thread
py_prepro cost: 0.000431s in each thread
py_client cost: 4.273316s in each thread
py_postpro cost: 0.000703s in each thread
mean: 494.598486328125ms
median: 480.2005615234375ms
80 percent: 486.3544921875ms
90 percent: 508.5200439453124ms
99 percent: 624.6452905273438ms
total cost: 5.024378299713135s
each thread cost: 4.9460344314575195s.
qps: 1.990295993550276samples/s
model_name:pddet_serving_model
batch_size:1
CPU_UTILIZATION: 0.0
MAX_GPU_MEMORY: 14525
GPU_UTILIZATION: 100
thread_num: 4
prepro cost: 0.0502565s in each thread
client_infer cost: 14.9771025s in each thread
op0 cost: 0.013033s in each thread
op1 cost: 14.754957s in each thread
op2 cost: 0.00012475s in each thread
postpro cost: 0.00036225s in each thread
bert_pre cost: 0.306132s in each thread
py_prepro cost: 0.000511s in each thread
py_client cost: 15.03027975s in each thread
py_postpro cost: 0.0009275s in each thread
mean: 1569.41435546875ms
median: 1614.8760986328125ms
80 percent: 1799.3856445312506ms
90 percent: 2011.609326171875ms
99 percent: 2379.27158203125ms
total cost: 16.35568356513977s
each thread cost: 15.694196701049805s.
qps: 2.4456330327431455samples/s
model_name:pddet_serving_model
batch_size:1
CPU_UTILIZATION: 0.1
MAX_GPU_MEMORY: 14525
GPU_UTILIZATION: 100
thread_num: 8
prepro cost: 0.0546985s in each thread
client_infer cost: 31.083384375s in each thread
op0 cost: 0.0140595s in each thread
op1 cost: 16.07133675s in each thread
op2 cost: 0.000132625s in each thread
postpro cost: 0.000318375s in each thread
bert_pre cost: 0.31432075s in each thread
py_prepro cost: 0.00053575s in each thread
py_client cost: 31.140613125s in each thread
py_postpro cost: 0.000807375s in each thread
mean: 3181.2632019042967ms
median: 3290.6607666015625ms
80 percent: 3338.09208984375ms
90 percent: 3686.9481689453123ms
99 percent: 3735.27556640625ms
total cost: 33.31558895111084s
each thread cost: 31.812688767910004s.
qps: 2.40127827598655samples/s
model_name:pddet_serving_model
batch_size:1
CPU_UTILIZATION: 0.0
MAX_GPU_MEMORY: 14525
GPU_UTILIZATION: 100
thread_num: 16
prepro cost: 0.0592799375s in each thread
client_infer cost: 62.949139375s in each thread
op0 cost: 0.0134921875s in each thread
op1 cost: 16.5226278125s in each thread
op2 cost: 0.00015525s in each thread
postpro cost: 0.0003169375s in each thread
bert_pre cost: 0.3272226875s in each thread
py_prepro cost: 0.000590125s in each thread
py_client cost: 63.0108379375s in each thread
py_postpro cost: 0.0008313125s in each thread
mean: 6370.063188171387ms
median: 6705.1651611328125ms
80 percent: 7052.77333984375ms
90 percent: 7165.431909179687ms
99 percent: 8213.415532226561ms
total cost: 67.53448605537415s
each thread cost: 63.70069542527199s.
qps: 2.3691599558307113samples/s
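
Read together, the four runs suggest the single-GPU server is the bottleneck: reported GPU utilization is pinned at 100%, throughput saturates around 2.4 samples/s from 4 threads onward, and mean latency grows roughly in proportion to thread count (about 0.49 s at 1 thread versus about 6.37 s at 16).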