Unverified commit d7f7f1d5, authored by Jiawei Wang and committed by GitHub

Merge branch 'v0.6.0' into v0.6.0

......@@ -59,7 +59,7 @@ message SimpleResponse { required int32 err_code = 1; }
message GetClientConfigRequest {}
message GetClientConfigResponse { repeated string client_config_str_list = 1; }
message GetClientConfigResponse { required string client_config_str = 1; }
service MultiLangGeneralModelService {
rpc Inference(InferenceRequest) returns (InferenceResponse) {}
......
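For context, the proto change above replaces the repeated `client_config_str_list` field with a single required `client_config_str`. A minimal sketch (not part of this commit) of how a gRPC client could consume the new field is shown below; the generated module names, the stub class name, and the `general_model_config_pb2` import path are assumptions based on standard protoc conventions and the client code changed later in this diff.

``` python
# Sketch only: fetch the client config over gRPC and parse the returned prototxt text.
# Module/stub names and the channel address are assumptions, not part of this commit.
import grpc
import google.protobuf.text_format
import multi_lang_general_model_service_pb2 as pb2
import multi_lang_general_model_service_pb2_grpc as pb2_grpc
from paddle_serving_client.proto import general_model_config_pb2 as m_config  # assumed import path

channel = grpc.insecure_channel("127.0.0.1:9393")
stub = pb2_grpc.MultiLangGeneralModelServiceStub(channel)
resp = stub.GetClientConfig(pb2.GetClientConfigRequest())

# The response now carries the serialized config text directly, so it can be
# parsed in memory instead of reading serving_client_conf.prototxt from disk.
model_conf = m_config.GeneralModelConfig()
google.protobuf.text_format.Merge(resp.client_config_str, model_conf)
print([var.alias_name for var in model_conf.feed_var])
```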
......@@ -153,7 +153,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DOPENCV_DIR=${OPENCV_DIR} \
-DWITH_OPENCV=ON
-DWITH_OPENCV=ON \
-DSERVER=ON ..
make -j10
```
......
......@@ -152,7 +152,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DOPENCV_DIR=${OPENCV_DIR} \
-DWITH_OPENCV=ON
-DWITH_OPENCV=ON \
-DSERVER=ON ..
make -j10
```
......
......@@ -84,3 +84,9 @@ set environment variables to specify which GPUs are used; the command above means
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
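The same request can also be issued from Python; the snippet below is a sketch equivalent to the curl call above, assuming the web service started earlier is listening on 127.0.0.1:9292.

``` python
# Sketch only: HTTP prediction request equivalent to the curl example above.
import requests

payload = {"feed": [{"words": "hello"}], "fetch": ["pooled_output"]}
resp = requests.post(
    "http://127.0.0.1:9292/bert/prediction",
    headers={"Content-Type": "application/json"},
    json=payload)
print(resp.json())
```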
## Benchmark
``` shell
bash benchmark.sh bert_seq128_model bert_seq128_client
```
The benchmark output is logged to `profile_log_bert_seq128_model`.
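If you only need the summary numbers, a small helper like the sketch below (not part of this commit) can pull the metric lines that benchmark.sh appends to the log; the key names come from the echo/awk statements in benchmark.sh shown later in this diff.

``` python
# Sketch only: print the summary metrics written by benchmark.sh.
# The log file name assumes the bert_seq128_model example above.
KEYS = ("model_name", "batch_size", "CPU_UTILIZATION",
        "MAX_GPU_MEMORY", "GPU_UTILIZATION")

with open("profile_log_bert_seq128_model") as f:
    for line in f:
        if line.startswith(KEYS):
            print(line.strip())
```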
......@@ -88,3 +88,13 @@ python bert_web_service_gpu.py bert_seq128_model/ 9292 # start the GPU inference service
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
## Benchmark
``` shell
bash benchmark.sh bert_seq128_model bert_seq128_client
```
The benchmark output is logged to `profile_log_bert_seq128_model`.
To change the benchmark parameters, edit the configuration in benchmark.sh.
Note: do not append a '/' to the bert_seq128_model and bert_seq128_client paths; this example must be run on a GPU machine.
......@@ -21,6 +21,7 @@ import sys
import time
import json
import requests
import numpy as np
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
......@@ -56,7 +57,11 @@ def single_func(idx, resource):
feed_batch = []
b_start = time.time()
for bi in range(args.batch_size):
feed_batch.append(reader.process(dataset[bi]))
feed_dict = reader.process(dataset[bi])
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(1, 128, 1))
feed_batch.append(feed_dict)
b_end = time.time()
if profile_flags:
......@@ -65,7 +70,8 @@ def single_func(idx, resource):
os.getpid(),
int(round(b_start * 1000000)),
int(round(b_end * 1000000))))
result = client.predict(feed=feed_batch, fetch=fetch)
result = client.predict(
feed=feed_batch, fetch=fetch, batch=True)
l_end = time.time()
if latency_flags:
......@@ -116,9 +122,7 @@ def single_func(idx, resource):
if __name__ == '__main__':
multi_thread_runner = MultiThreadRunner()
endpoint_list = [
"127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
]
endpoint_list = ["127.0.0.1:9292", "127.0.0.1:9293"]
turns = 100
start = time.time()
result = multi_thread_runner.run(
......
rm profile_log*
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
......@@ -12,32 +12,35 @@ else
mkdir utilization
fi
#start server
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim > elog 2>&1 &
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1 --mem_optim --ir_optim > elog 2>&1 &
sleep 5
#warm up
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
echo -e "import psutil\nimport time\nwhile True:\n\tcpu_res = psutil.cpu_percent()\n\twith open('cpu.txt', 'a+') as f:\n\t\tf.write(f'{cpu_res}\\\n')\n\ttime.sleep(0.1)" > cpu.py
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64
do
job_bt=`date '+%Y%m%d%H%M%S'`
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_memory_use.log 2>&1 &
nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
rm -rf cpu.txt
$PYTHONROOT/bin/python3 cpu.py &
gpu_memory_pid=$!
$PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
kill ${gpu_memory_pid}
kill `ps -ef|grep used_memory|awk '{print $2}'`
kill `ps -ef|grep used_memory|awk '{print $2}'` > /dev/null
kill `ps -ef|grep utilization.gpu|awk '{print $2}'` > /dev/null
kill `ps -ef|grep cpu.py|awk '{print $2}'` > /dev/null
echo "model_name:" $1
echo "thread_num:" $thread_num
echo "batch_size:" $batch_size
echo "=================Done===================="
echo "model_name:$1" >> profile_log_$1
echo "batch_size:$batch_size" >> profile_log_$1
$PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
job_et=`date '+%Y%m%d%H%M%S'`
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "CPU_UTILIZATION:", max}' cpu.txt >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_memory_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
rm -rf gpu_use.log gpu_utilization.log
$PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1 2> elog > stdlog &
export FLAGS_profile_client=1
export FLAGS_profile_server=1
sleep 5
......
......@@ -18,7 +18,7 @@ import sys
import numpy as np
preprocess = Sequential([
File2Image(), BGR2RGB(),
Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False),
Resize((512, 512)), Transpose((2, 0, 1))
])
......@@ -33,6 +33,7 @@ im = preprocess(sys.argv[1])
fetch_map = client.predict(
feed={
"image": im,
"im_shape": np.array([512, 512]),
"scale_factor": np.array([1.0, 1.0]).reshape(-1),
},
fetch=["save_infer_model/scale_0.tmp_1"],
......
......@@ -42,3 +42,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
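The same prediction can also be made over RPC with the exported client config. Below is a minimal sketch adapted from the benchmark client touched in this commit; the zero-valued feature vector is a placeholder, and the `127.0.0.1:9393` endpoint assumes the RPC server is reachable on the same port as above.

``` python
# Sketch only: RPC prediction against the uci_housing model, adapted from benchmark.py.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

# Shape one sample as a (1, 13) float32 batch and request the "price" output,
# mirroring the commented-out batch example in benchmark.py.
x = np.zeros((1, 13)).astype("float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```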
## Benchmark
``` shell
bash benchmark.sh uci_housing_model uci_housing_client
```
The benchmark output is logged to `profile_log_uci_housing_model`.
......@@ -43,3 +43,13 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
## Benchmark
``` shell
bash benchmark.sh uci_housing_model uci_housing_client
```
The benchmark output is logged to `profile_log_uci_housing_model`.
To change the benchmark parameters, edit the configuration in benchmark.sh.
Note: do not append a '/' to the uci_housing_model and uci_housing_client paths; this example must be run on a GPU machine.
......@@ -15,7 +15,7 @@
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args
from paddle_serving_client.utils import benchmark_args, show_latency
import time
import paddle
import sys
......@@ -30,6 +30,7 @@ def single_func(idx, resource):
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=1)
total_number = sum(1 for _ in train_reader())
latency_list = []
if args.request == "rpc":
client = Client()
......@@ -37,12 +38,12 @@ def single_func(idx, resource):
client.connect([args.endpoint])
start = time.time()
for data in train_reader():
#new_data = np.zeros((1, 13)).astype("float32")
#new_data[0] = data[0][0]
#fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=True)
l_start = time.time()
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
end = time.time()
return [[end - start], [total_number]]
return [[end - start], latency_list, [total_number]]
elif args.request == "http":
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -50,13 +51,27 @@ def single_func(idx, resource):
batch_size=1)
start = time.time()
for data in train_reader():
l_start = time.time()
r = requests.post(
'http://{}/uci/prediction'.format(args.endpoint),
data={"x": data[0]})
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
end = time.time()
return [[end - start], [total_number]]
return [[end - start], latency_list, [total_number]]
start = time.time()
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(single_func, args.thread, {})
print(result)
end = time.time()
total_cost = end - start
avg_cost = 0
for i in range(args.thread):
avg_cost += result[0][i]
avg_cost = avg_cost / args.thread
print("total cost: {}s".format(total_cost))
print("each thread cost: {}s. ".format(avg_cost))
print("qps: {}samples/s".format(args.batch_size * args.thread / total_cost))
show_latency(result[1])
rm profile_log*
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
gpu_id=0
#save cpu and gpu utilization log
if [ -d utilization ];then
rm -rf utilization
else
mkdir utilization
fi
#start server
$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1 --mem_optim --ir_optim > elog 2>&1 &
sleep 5
#warm up
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
echo -e "import psutil\nimport time\nwhile True:\n\tcpu_res = psutil.cpu_percent()\n\twith open('cpu.txt', 'a+') as f:\n\t\tf.write(f'{cpu_res}\\\n')\n\ttime.sleep(0.1)" > cpu.py
for thread_num in 1 4 8 16
do
for batch_size in 1 4 16 64
do
job_bt=`date '+%Y%m%d%H%M%S'`
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_memory_use.log 2>&1 &
nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
rm -rf cpu.txt
$PYTHONROOT/bin/python3 cpu.py &
gpu_memory_pid=$!
$PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
kill `ps -ef|grep used_memory|awk '{print $2}'` > /dev/null
kill `ps -ef|grep utilization.gpu|awk '{print $2}'` > /dev/null
kill `ps -ef|grep cpu.py|awk '{print $2}'` > /dev/null
echo "model_name:" $1
echo "thread_num:" $thread_num
echo "batch_size:" $batch_size
echo "=================Done===================="
echo "model_name:$1" >> profile_log_$1
echo "batch_size:$batch_size" >> profile_log_$1
job_et=`date '+%Y%m%d%H%M%S'`
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "CPU_UTILIZATION:", max}' cpu.txt >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_memory_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
rm -rf gpu_use.log gpu_utilization.log
$PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
tail -n 8 profile >> profile_log_$1
echo "" >> profile_log_$1
done
done
#Divided log
awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' profile_log_$1
mkdir bert_log && mv bert_log_* bert_log
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
......@@ -5,6 +5,7 @@ import collections
profile_file = sys.argv[1]
thread_num = sys.argv[2]
time_dict = collections.OrderedDict()
query_count = 0
def prase(line):
......@@ -26,12 +27,15 @@ def prase(line):
with open(profile_file) as f:
query_count = 0
for line in f.readlines():
line = line.strip().split("\t")
if line[0] == "PROFILE":
prase(line[2])
query_count += 1
print("thread_num: {}".format(thread_num))
print("query_count: {}".format(query_count))
for name in time_dict:
print("{} cost: {}s in each thread ".format(name, time_dict[name] / (
1000000.0 * float(thread_num))))
......@@ -554,15 +554,8 @@ class MultiLangClient(object):
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_path_list = resp.client_config_str_list
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_server_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
self._parse_model_config(file_path_list)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
......@@ -572,23 +565,10 @@ class MultiLangClient(object):
else:
yield item
def _parse_model_config(self, model_config_path_list):
if isinstance(model_config_path_list, str):
model_config_path_list = [model_config_path_list]
elif isinstance(model_config_path_list, list):
pass
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_client_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[0], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
......@@ -598,11 +578,6 @@ class MultiLangClient(object):
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
if len(file_path_list) > 1:
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[-1], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
for i, var in enumerate(model_conf.fetch_var):
......
......@@ -198,5 +198,14 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
#model_config_path_list is a list right now.
#a dict should be added when graphMaker is used.
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str_list[:] = self.model_config_path_list
model_config_str = []
for single_model_config in self.model_config_path_list:
if os.path.isdir(single_model_config):
with open("{}/serving_server_conf.prototxt".format(
single_model_config)) as f:
model_config_str.append(str(f.read()))
elif os.path.isfile(single_model_config):
with open(single_model_config) as f:
model_config_str.append(str(f.read()))
resp.client_config_str = model_config_str[0]
return resp