#!/usr/bin/env bash
# Benchmark the PaddleServing BERT (seq_len=20) model:
#   1. start a 2-GPU serving process on port 9295,
#   2. run the batched benchmark client against it,
#   3. post-process the client profile into a summary and a timeline trace.
# Requires: bert_seq20_model/ and serving_client_conf/ in the CWD.
set -eu

# Profiling flags must be in the environment BEFORE the server forks;
# exporting them after `... &` would leave the server process without them.
export FLAGS_profile_client=1
export FLAGS_profile_server=1
export CUDA_VISIBLE_DEVICES=0,1

# Launch the serving process on both GPUs; logs go to ./stdlog and ./elog.
python -m paddle_serving_server.serve \
  --model bert_seq20_model/ \
  --port 9295 \
  --thread 4 \
  --gpu_ids 0,1 > stdlog 2> elog &
server_pid=$!

# Give the server time to load the model and bind the port.
sleep 5

thread_num=4

# The client writes its raw profile records to stderr -> ./profile.
python benchmark_batch.py \
  --thread "${thread_num}" \
  --batch_size 64 \
  --model serving_client_conf/serving_client_conf.prototxt 2> profile

# Summarize per-thread timings, then emit a chrome://tracing timeline file.
python show_profile.py profile "${thread_num}"
python timeline_trace.py profile trace

# Reap the background server so the script does not leak a GPU process.
kill "${server_pid}" 2>/dev/null || true