diff --git a/python/examples/bert/benchmark_batch.py b/python/examples/bert/benchmark_batch.py
index b4d13c7db6b3c32c7e8ccd75c33ce25a196e0ea8..e0f677146a47c0366a1bbafe9eff049e2671a617 100644
--- a/python/examples/bert/benchmark_batch.py
+++ b/python/examples/bert/benchmark_batch.py
@@ -41,13 +41,13 @@ def single_func(idx, resource):
         client = Client()
         client.load_client_config(args.model)
         client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
+        feed_batch = []
+        for bi in range(args.batch_size):
+            feed_batch.append(reader.process(dataset[bi]))

         start = time.time()
         for i in range(1000):
             if args.batch_size >= 1:
-                feed_batch = []
-                for bi in range(args.batch_size):
-                    feed_batch.append(reader.process(dataset[i]))
                 result = client.batch_predict(
                     feed_batch=feed_batch, fetch=fetch)
             else:
@@ -61,7 +61,9 @@ def single_func(idx, resource):

 if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
+    endpoint_list = [
+        "127.0.0.1:9295", "127.0.0.1:9296", "127.0.0.1:9297", "127.0.0.1:9298"
+    ]
     result = multi_thread_runner.run(single_func, args.thread,
                                      {"endpoint": endpoint_list})
     avg_cost = 0
diff --git a/python/examples/bert/benchmark_batch.sh b/python/examples/bert/benchmark_batch.sh
index 46ba451d0ade36c24151e260d5c9b3cc3666a548..272923776d6640880175745920a8fad9e84972fd 100644
--- a/python/examples/bert/benchmark_batch.sh
+++ b/python/examples/bert/benchmark_batch.sh
@@ -1,10 +1,17 @@
 rm profile_log
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+
+sleep 5
+
 for thread_num in 1 2 4 8 16
 do
 for batch_size in 1 2 4 8 16 32 64 128 256 512
 do
     $PYTHONROOT/bin/python benchmark_batch.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
     echo "========================================"
+    echo "thread num: ", $thread_num
+    echo "batch size: ", $batch_size
     echo "batch size : $batch_size" >> profile_log
     $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
     tail -n 1 profile >> profile_log
diff --git a/python/examples/bert/benchmark_with_profile.sh b/python/examples/bert/benchmark_with_profile.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8102e30d5c794d5e21d34e2f4ffd88a1af791b5e
--- /dev/null
+++ b/python/examples/bert/benchmark_with_profile.sh
@@ -0,0 +1,10 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+export FLAGS_profile_client=1
+export FLAGS_profile_server=1
+sleep 5
+thread_num=4
+python benchmark_batch.py --thread ${thread_num} --batch_size 64 --model serving_client_conf/serving_client_conf.prototxt 2> profile
+
+python show_profile.py profile ${thread_num}
+python timeline_trace.py profile trace
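
Note on the benchmark_batch.py hunk: feed_batch is now built once, before start = time.time(), so the 1000-iteration loop times only client.batch_predict and no longer re-runs reader.process on every iteration. It also indexes the dataset with bi (bounded by --batch_size) instead of the loop counter i, which previously ran up to 999 and would index past the end of a shorter dataset. Each client thread still picks its endpoint via resource["endpoint"][idx % len(resource["endpoint"])], so the four ports 9295-9298 are spread across threads round-robin. A minimal sketch of the resulting timing pattern, with hypothetical preprocess() and predict() callables standing in for reader.process() and client.batch_predict():

    import time

    def timed_benchmark(predict, preprocess, samples, batch_size, iters=1000):
        # Build the feed batch a single time, outside the timed region,
        # so preprocessing cost is excluded from the measurement.
        feed_batch = [preprocess(samples[bi]) for bi in range(batch_size)]
        start = time.time()
        for _ in range(iters):
            predict(feed_batch)  # only the prediction call is inside the timer
        return time.time() - start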