diff --git a/core/general-server/op/general_dist_kv_infer_op.cpp b/core/general-server/op/general_dist_kv_infer_op.cpp
index 9c6c70352b5387fab95acd16cdf79aa2b46f6122..adaa6cbc1818fc5300faf662d98ad47c9af4c468 100644
--- a/core/general-server/op/general_dist_kv_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_infer_op.cpp
@@ -90,6 +90,9 @@ int GeneralDistKVInferOp::inference() {
               keys.begin() + key_idx);
     key_idx += dataptr_size_pairs[i].second;
   }
+  Timer timeline;
+  int64_t cube_start = timeline.TimeStampUS();
+  timeline.Start();
   rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
   std::vector<std::string> table_names = cube->get_table_names();
   if (table_names.size() == 0) {
@@ -97,7 +100,7 @@ int GeneralDistKVInferOp::inference() {
     return -1;
   }
   int ret = cube->seek(table_names[0], keys, &values);
-
+  int64_t cube_end = timeline.TimeStampUS();
   if (values.size() != keys.size() || values[0].buff.size() == 0) {
     LOG(ERROR) << "cube value return null";
   }
@@ -153,9 +156,7 @@ int GeneralDistKVInferOp::inference() {
 
   VLOG(2) << "infer batch size: " << batch_size;
 
-  Timer timeline;
   int64_t start = timeline.TimeStampUS();
-  timeline.Start();
 
   if (InferManager::instance().infer(
           engine_name().c_str(), &infer_in, out, batch_size)) {
@@ -165,6 +166,8 @@ int GeneralDistKVInferOp::inference() {
 
   int64_t end = timeline.TimeStampUS();
   CopyBlobInfo(input_blob, output_blob);
+  AddBlobInfo(output_blob, cube_start);
+  AddBlobInfo(output_blob, cube_end);
   AddBlobInfo(output_blob, start);
   AddBlobInfo(output_blob, end);
   return 0;
diff --git a/python/examples/criteo_ctr_with_cube/benchmark.py b/python/examples/criteo_ctr_with_cube/benchmark.py
index a850d244b0a5a1a01e98a6207fa9674b6ea0af1a..5e952474a9bb003ab819b61eb2321374bf19539c 100755
--- a/python/examples/criteo_ctr_with_cube/benchmark.py
+++ b/python/examples/criteo_ctr_with_cube/benchmark.py
@@ -71,14 +71,17 @@ if __name__ == '__main__':
     multi_thread_runner = MultiThreadRunner()
     endpoint_list = ["127.0.0.1:9292"]
     #result = single_func(0, {"endpoint": endpoint_list})
+    start = time.time()
     result = multi_thread_runner.run(single_func, args.thread,
                                      {"endpoint": endpoint_list})
-    print(result)
+    end = time.time()
+    total_cost = end - start
     avg_cost = 0
     qps = 0
     for i in range(args.thread):
         avg_cost += result[0][i * 2 + 0]
         qps += result[0][i * 2 + 1]
     avg_cost = avg_cost / args.thread
+    print("total cost: {}".format(total_cost))
     print("average total cost {} s.".format(avg_cost))
     print("qps {} ins/s".format(qps))
diff --git a/python/examples/criteo_ctr_with_cube/benchmark.sh b/python/examples/criteo_ctr_with_cube/benchmark.sh
index 35b19b637d9e8dec10fd3b59224c5c17e3ba5f53..21daf9331ec4a7ba98ac73fc4570b024681aa06a 100755
--- a/python/examples/criteo_ctr_with_cube/benchmark.sh
+++ b/python/examples/criteo_ctr_with_cube/benchmark.sh
@@ -1,9 +1,23 @@
 rm profile_log
 export FLAGS_profile_client=1
 export FLAGS_profile_server=1
-for thread_num in 1 2 4 8 16
+
+wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
+tar xf ctr_cube_unittest.tar.gz
+mv models/ctr_client_conf ./
+mv models/ctr_serving_model_kv ./
+mv models/data ./cube/
+
+wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
+tar xf cube_app.tar.gz
+mv cube_app/cube* ./cube/
+sh cube_prepare.sh &
+
+python test_server.py ctr_serving_model_kv > serving_log 2>&1 &
+
+for thread_num in 1 4 16
 do
-for batch_size in 1 4 16 64 256
+for batch_size in 1 4 16 64
 do
     $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
     echo "batch size : $batch_size"
@@ -11,6 +25,8 @@ do
     echo "========================================"
     echo "batch size : $batch_size" >> profile_log
     $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 2 profile >> profile_log
+    tail -n 3 profile >> profile_log
 done
 done
+
+ps -ef | grep 'serving' | grep -v grep | awk '{print $2}' | xargs kill -9  # kill leftover serving processes; PID is the 2nd field of `ps -ef`, so select it by field rather than by fixed character columns