Unverified commit d7f7f1d5, authored by Jiawei Wang and committed by GitHub

Merge branch 'v0.6.0' into v0.6.0

......@@ -59,7 +59,7 @@ message SimpleResponse { required int32 err_code = 1; }
message GetClientConfigRequest {}
message GetClientConfigResponse { repeated string client_config_str_list = 1; }
message GetClientConfigResponse { required string client_config_str = 1; }
service MultiLangGeneralModelService {
rpc Inference(InferenceRequest) returns (InferenceResponse) {}
......
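For context, the proto change above replaces the repeated `client_config_str_list` field with a single required `client_config_str`. A minimal sketch (not part of this commit) of how a gRPC client could consume the new field is shown below; the generated module names, the stub class name, and the `general_model_config_pb2` import path are assumptions based on standard protoc conventions and the client code changed later in this diff.

``` python
# Sketch only: fetch the client config over gRPC and parse the returned prototxt text.
# Module/stub names and the channel address are assumptions, not part of this commit.
import grpc
import google.protobuf.text_format
import multi_lang_general_model_service_pb2 as pb2
import multi_lang_general_model_service_pb2_grpc as pb2_grpc
from paddle_serving_client.proto import general_model_config_pb2 as m_config  # assumed import path

channel = grpc.insecure_channel("127.0.0.1:9393")
stub = pb2_grpc.MultiLangGeneralModelServiceStub(channel)
resp = stub.GetClientConfig(pb2.GetClientConfigRequest())

# The response now carries the serialized config text directly, so it can be
# parsed in memory instead of reading serving_client_conf.prototxt from disk.
model_conf = m_config.GeneralModelConfig()
google.protobuf.text_format.Merge(resp.client_config_str, model_conf)
print([var.alias_name for var in model_conf.feed_var])
```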
......@@ -153,7 +153,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DOPENCV_DIR=${OPENCV_DIR} \
-DWITH_OPENCV=ON
-DWITH_OPENCV=ON \
-DSERVER=ON ..
make -j10
```
......
......@@ -152,7 +152,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DOPENCV_DIR=${OPENCV_DIR} \
-DWITH_OPENCV=ON
-DWITH_OPENCV=ON \
-DSERVER=ON ..
make -j10
```
......
......@@ -84,3 +84,9 @@ set environment variables to specify which GPUs are used; the command above means
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
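The same request can also be issued from Python; the snippet below is a sketch equivalent to the curl call above, assuming the web service started earlier is listening on 127.0.0.1:9292.

``` python
# Sketch only: HTTP prediction request equivalent to the curl example above.
import requests

payload = {"feed": [{"words": "hello"}], "fetch": ["pooled_output"]}
resp = requests.post(
    "http://127.0.0.1:9292/bert/prediction",
    headers={"Content-Type": "application/json"},
    json=payload)
print(resp.json())
```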
## Benchmark
``` shell
bash benchmark.sh bert_seq128_model bert_seq128_client
```
The benchmark output is logged to `profile_log_bert_seq128_model`.
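If you only need the summary numbers, a small helper like the sketch below (not part of this commit) can pull the metric lines that benchmark.sh appends to the log; the key names come from the echo/awk statements in benchmark.sh shown later in this diff.

``` python
# Sketch only: print the summary metrics written by benchmark.sh.
# The log file name assumes the bert_seq128_model example above.
KEYS = ("model_name", "batch_size", "CPU_UTILIZATION",
        "MAX_GPU_MEMORY", "GPU_UTILIZATION")

with open("profile_log_bert_seq128_model") as f:
    for line in f:
        if line.startswith(KEYS):
            print(line.strip())
```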
......@@ -88,3 +88,13 @@ python bert_web_service_gpu.py bert_seq128_model/ 9292 # start the GPU inference service
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
## Benchmark
``` shell
bash benchmark.sh bert_seq128_model bert_seq128_client
```
The benchmark output is logged to `profile_log_bert_seq128_model`.
To change the benchmark parameters, edit the configuration in benchmark.sh.
Note: do not append a '/' to the bert_seq128_model and bert_seq128_client paths; this example must be run on a GPU machine.
......@@ -21,6 +21,7 @@ import sys
import time
import json
import requests
import numpy as np
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
......@@ -56,7 +57,11 @@ def single_func(idx, resource):
feed_batch = []
b_start = time.time()
for bi in range(args.batch_size):
feed_batch.append(reader.process(dataset[bi]))
feed_dict = reader.process(dataset[bi])
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(1, 128, 1))
feed_batch.append(feed_dict)
b_end = time.time()
if profile_flags:
......@@ -65,7 +70,8 @@ def single_func(idx, resource):
os.getpid(),
int(round(b_start * 1000000)),
int(round(b_end * 1000000))))
result = client.predict(feed=feed_batch, fetch=fetch)
result = client.predict(
feed=feed_batch, fetch=fetch, batch=True)
l_end = time.time()
if latency_flags:
......@@ -116,9 +122,7 @@ def single_func(idx, resource):
if __name__ == '__main__':
multi_thread_runner = MultiThreadRunner()
endpoint_list = [
"127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
]
endpoint_list = ["127.0.0.1:9292", "127.0.0.1:9293"]
turns = 100
start = time.time()
result = multi_thread_runner.run(
......
rm profile_log*
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
......@@ -12,32 +12,35 @@ else
mkdir utilization
fi
#start server
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim > elog 2>&1 &
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1 --mem_optim --ir_optim > elog 2>&1 &
sleep 5
#warm up
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
echo -e "import psutil\nimport time\nwhile True:\n\tcpu_res = psutil.cpu_percent()\n\twith open('cpu.txt', 'a+') as f:\n\t\tf.write(f'{cpu_res}\\\n')\n\ttime.sleep(0.1)" > cpu.py
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64
do
job_bt=`date '+%Y%m%d%H%M%S'`
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_memory_use.log 2>&1 &
nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
rm -rf cpu.txt
$PYTHONROOT/bin/python3 cpu.py &
gpu_memory_pid=$!
$PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
kill ${gpu_memory_pid}
kill `ps -ef|grep used_memory|awk '{print $2}'`
kill `ps -ef|grep used_memory|awk '{print $2}'` > /dev/null
kill `ps -ef|grep utilization.gpu|awk '{print $2}'` > /dev/null
kill `ps -ef|grep cpu.py|awk '{print $2}'` > /dev/null
echo "model_name:" $1
echo "thread_num:" $thread_num
echo "batch_size:" $batch_size
echo "=================Done===================="
echo "model_name:$1" >> profile_log_$1
echo "batch_size:$batch_size" >> profile_log_$1
$PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
job_et=`date '+%Y%m%d%H%M%S'`
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "CPU_UTILIZATION:", max}' cpu.txt >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_memory_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
rm -rf gpu_use.log gpu_utilization.log
$PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1 2> elog > stdlog &
export FLAGS_profile_client=1
export FLAGS_profile_server=1
sleep 5
......
......@@ -18,7 +18,7 @@ import sys
import numpy as np
preprocess = Sequential([
File2Image(), BGR2RGB(),
Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False),
Resize((512, 512)), Transpose((2, 0, 1))
])
......@@ -33,6 +33,7 @@ im = preprocess(sys.argv[1])
fetch_map = client.predict(
feed={
"image": im,
"im_shape": np.array([512, 512]),
"scale_factor": np.array([1.0, 1.0]).reshape(-1),
},
fetch=["save_infer_model/scale_0.tmp_1"],
......
......@@ -42,3 +42,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
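The same prediction can also be made over RPC with the exported client config. Below is a minimal sketch adapted from the benchmark client touched in this commit; the zero-valued feature vector is a placeholder, and the `127.0.0.1:9393` endpoint assumes the RPC server is reachable on the same port as above.

``` python
# Sketch only: RPC prediction against the uci_housing model, adapted from benchmark.py.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

# Shape one sample as a (1, 13) float32 batch and request the "price" output,
# mirroring the commented-out batch example in benchmark.py.
x = np.zeros((1, 13)).astype("float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```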
## Benchmark
``` shell
bash benchmark.sh uci_housing_model uci_housing_client
```
The benchmark output is logged to `profile_log_uci_housing_model`.
......@@ -43,3 +43,13 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
## Benchmark
``` shell
bash benchmark.sh uci_housing_model uci_housing_client
```
The benchmark output is logged to `profile_log_uci_housing_model`.
To change the benchmark parameters, edit the configuration in benchmark.sh.
Note: do not append a '/' to the uci_housing_model and uci_housing_client paths; this example must be run on a GPU machine.
......@@ -15,7 +15,7 @@
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from paddle_serving_client.utils import benchmark_args, show_latency
import time
import paddle
import sys
......@@ -30,6 +30,7 @@ def single_func(idx, resource):
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=1)
total_number = sum(1 for _ in train_reader())
latency_list = []
if args.request == "rpc":
client = Client()
......@@ -37,12 +38,12 @@ def single_func(idx, resource):
client.connect([args.endpoint])
start = time.time()
for data in train_reader():
#new_data = np.zeros((1, 13)).astype("float32")
#new_data[0] = data[0][0]
#fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=True)
l_start = time.time()
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
end = time.time()
return [[end - start], [total_number]]
return [[end - start], latency_list, [total_number]]
elif args.request == "http":
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -50,13 +51,27 @@ def single_func(idx, resource):
batch_size=1)
start = time.time()
for data in train_reader():
l_start = time.time()
r = requests.post(
'http://{}/uci/prediction'.format(args.endpoint),
data={"x": data[0]})
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
end = time.time()
return [[end - start], [total_number]]
return [[end - start], latency_list, [total_number]]
start = time.time()
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(single_func, args.thread, {})
print(result)
end = time.time()
total_cost = end - start
avg_cost = 0
for i in range(args.thread):
avg_cost += result[0][i]
avg_cost = avg_cost / args.thread
print("total cost: {}s".format(total_cost))
print("each thread cost: {}s. ".format(avg_cost))
print("qps: {}samples/s".format(args.batch_size * args.thread / total_cost))
show_latency(result[1])
rm profile_log*
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
gpu_id=0
#save cpu and gpu utilization log
if [ -d utilization ];then
rm -rf utilization
else
mkdir utilization
fi
#start server
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1 --mem_optim --ir_optim > elog 2>&1 &
sleep 5
#warm up
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
echo -e "import psutil\nimport time\nwhile True:\n\tcpu_res = psutil.cpu_percent()\n\twith open('cpu.txt', 'a+') as f:\n\t\tf.write(f'{cpu_res}\\\n')\n\ttime.sleep(0.1)" > cpu.py
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64
do
job_bt=`date '+%Y%m%d%H%M%S'`
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_memory_use.log 2>&1 &
nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
rm -rf cpu.txt
$PYTHONROOT/bin/python3 cpu.py &
gpu_memory_pid=$!
$PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
kill `ps -ef|grep used_memory|awk '{print $2}'` > /dev/null
kill `ps -ef|grep utilization.gpu|awk '{print $2}'` > /dev/null
kill `ps -ef|grep cpu.py|awk '{print $2}'` > /dev/null
echo "model_name:" $1
echo "thread_num:" $thread_num
echo "batch_size:" $batch_size
echo "=================Done===================="
echo "model_name:$1" >> profile_log_$1
echo "batch_size:$batch_size" >> profile_log_$1
job_et=`date '+%Y%m%d%H%M%S'`
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "CPU_UTILIZATION:", max}' cpu.txt >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_memory_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
rm -rf gpu_use.log gpu_utilization.log
$PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
tail -n 8 profile >> profile_log_$1
echo "" >> profile_log_$1
done
done
#Divided log
awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' profile_log_$1
mkdir bert_log && mv bert_log_* bert_log
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
......@@ -5,6 +5,7 @@ import collections
profile_file = sys.argv[1]
thread_num = sys.argv[2]
time_dict = collections.OrderedDict()
query_count = 0
def prase(line):
......@@ -26,12 +27,15 @@ def prase(line):
with open(profile_file) as f:
query_count = 0
for line in f.readlines():
line = line.strip().split("\t")
if line[0] == "PROFILE":
prase(line[2])
query_count += 1
print("thread_num: {}".format(thread_num))
print("query_count: {}".format(query_count))
for name in time_dict:
print("{} cost: {}s in each thread ".format(name, time_dict[name] / (
1000000.0 * float(thread_num))))
......@@ -554,15 +554,8 @@ class MultiLangClient(object):
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_path_list = resp.client_config_str_list
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_server_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
self._parse_model_config(file_path_list)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
......@@ -572,23 +565,10 @@ class MultiLangClient(object):
else:
yield item
def _parse_model_config(self, model_config_path_list):
if isinstance(model_config_path_list, str):
model_config_path_list = [model_config_path_list]
elif isinstance(model_config_path_list, list):
pass
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_client_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[0], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
......@@ -598,11 +578,6 @@ class MultiLangClient(object):
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
if len(file_path_list) > 1:
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[-1], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
for i, var in enumerate(model_conf.fetch_var):
......
......@@ -198,5 +198,14 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
#model_config_path_list is a list right now.
#a dict should be added when graphMaker is used.
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str_list[:] = self.model_config_path_list
model_config_str = []
for single_model_config in self.model_config_path_list:
if os.path.isdir(single_model_config):
with open("{}/serving_server_conf.prototxt".format(
single_model_config)) as f:
model_config_str.append(str(f.read()))
elif os.path.isfile(single_model_config):
with open(single_model_config) as f:
model_config_str.append(str(f.read()))
resp.client_config_str = model_config_str[0]
return resp