diff --git a/python/examples/bert/parse_profile.py b/python/examples/bert/parse_profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcc6e05b5ecb0120ae6f31c1f6b8ba8549a5d92d
--- /dev/null
+++ b/python/examples/bert/parse_profile.py
@@ -0,0 +1,94 @@
+import sys
+import os
+## general info
+cuda_version = ""
+cudnn_version = ""
+trt_version = ""
+python_version = ""
+gcc_version = ""
+paddle_version = ""
+cpu = ""
+gpu = ""
+xpu = ""
+api = ""
+owner = ""
+## model info
+model_name = ""
+model_type = ""
+model_source = ""
+model_url = ""
+## data info
+batch_size = ""
+num_of_samples = ""
+input_shape = ""
+## conf info
+runtime_device = ""
+ir_optim = ""
+enable_memory_optim = ""
+enable_tensorrt = ""
+precision = ""
+enable_mkldnn = ""
+cpu_math_library_num_threads = ""
+## acc info
+acc1 = ""
+acc5 = ""
+## perf info
+average_latency, QPS = "", ""
+process_latency = ""
+cpu_rss, vms, shared, dirty, cpu_usage = "", "", "", "", ""
+gpu_id, total, free, used, gpu_utilization_rate, gpu_mem_utilization_rate = "", "", "", "", "", ""
+
+class LogHandler(object):
+    def __init__(self):
+        self.fstr = ""
+
+    def print(self):
+        print(self.fstr)
+
+    def dump(self):
+        with open("inference_profile.log", 'w') as fout:
+            fout.write(self.fstr)
+
+    def append(self, new_str):
+        self.fstr += new_str + "\n"
+
+fh = LogHandler()
+
+fh.append("cuda_version: {}".format(cuda_version))
+fh.append("cudnn_version: {}".format(cudnn_version))
+fh.append("trt_version: {}".format(trt_version))
+fh.append("python_version: {}".format(python_version))
+fh.append("gcc_version: {}".format(gcc_version))
+fh.append("paddle_version: {}".format(paddle_version))
+fh.append("cpu: {}".format(cpu))
+fh.append("gpu: {}".format(gpu))  # p4, v100, 1080
+fh.append("xpu: {}".format(xpu))
+fh.append("api: {}".format(api))
+fh.append("owner: {}".format(owner))
+fh.append("----------------------- Model info ----------------------")
+fh.append("model_name: {}".format(model_name))
+fh.append("model_type: {}".format(model_type))
+fh.append("model_source: {}".format(model_source))
+fh.append("model_url: {}".format(model_url))
+fh.append("----------------------- Data info -----------------------")
+fh.append("batch_size: {}".format(batch_size))
+fh.append("num_of_samples: {}".format(num_of_samples))
+fh.append("input_shape: {}".format(input_shape))
+fh.append("----------------------- Conf info -----------------------")
+fh.append("runtime_device: {}".format(runtime_device))
+fh.append("ir_optim: {}".format(ir_optim))
+fh.append("enable_memory_optim: {}".format(enable_memory_optim))
+fh.append("enable_tensorrt: {}".format(enable_tensorrt))
+fh.append("precision: {}".format(precision))  # fp32, fp16, int8
+fh.append("enable_mkldnn: {}".format(enable_mkldnn))
+fh.append("cpu_math_library_num_threads: {}".format(cpu_math_library_num_threads))
+fh.append("----------------------- Acc info ------------------------")
+fh.append("acc1: {}".format(acc1))
+fh.append("acc5: {}".format(acc5))
+fh.append("----------------------- Perf info -----------------------")
+fh.append("average_latency(ms): {}, QPS: {}".format(average_latency, QPS))
+fh.append("process_latency(ms): {}".format(process_latency))
+fh.append("process_name: clas_benchmark, cpu_rss(MB): {}, vms(MB): {}, shared(MB): {}, dirty(MB): {}, cpu_usage(%): {}".format(cpu_rss, vms, shared, dirty, cpu_usage))
+fh.append("gpu_id: {}, total(MB): {}, free(MB): {}, used(MB): {}, gpu_utilization_rate(%): {}, gpu_mem_utilization_rate(%): {}".format(gpu_id, total, free, used, gpu_utilization_rate, gpu_mem_utilization_rate))
+
+fh.dump()
diff --git a/python/paddle_serving_server_gpu/web_service.py b/python/paddle_serving_server_gpu/web_service.py
index 67b78926688076fe911f871638d774bf6275b728..36013b80dbeeaf223bb23f0a68c3a4bd5d263204 100644
--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -14,6 +14,7 @@
 #!flask/bin/python
 # pylint: disable=doc-string-missing
 
+from time import time as _time
 from flask import Flask, request, abort
 from contextlib import closing
 from multiprocessing import Pool, Process, Queue
@@ -24,7 +25,9 @@
 import socket
 import sys
 import numpy as np
 import paddle_serving_server_gpu as serving
-
+import collections
+import queue
+from .profiler import TimeProfiler, PerformanceTracer
 from paddle_serving_server_gpu import pipeline
 from paddle_serving_server_gpu.pipeline import Op
@@ -51,6 +54,15 @@ class WebService(object):
     def get_pipeline_response(self, read_op):
         return None
 
+    def setup_profile(self, trace_interval=10, thread_num=1):
+        self.is_profile = True
+        if self.is_profile:
+            self._tracer = PerformanceTracer(True, trace_interval, thread_num)
+            self.trace_buffer = self._tracer.data_buffer()
+            self._profiler = TimeProfiler()
+            self._profiler.enable(True)
+            self.data_id = 0
+
     def prepare_pipeline_config(self, yaml_file):
         # build dag
         read_op = pipeline.RequestOp()
@@ -208,20 +220,62 @@ class WebService(object):
             abort(400)
         if "fetch" not in request.json:
             abort(400)
+        start_call, end_call = None, None
+        if getattr(self, "is_profile", False):
+            trace_que = collections.deque()
+            start_call = self._profiler.record("call_{}".format(self.data_id))
         try:
+            start = int(round(_time() * 1000000))
             feed, fetch, is_batch = self.preprocess(request.json["feed"],
                                                     request.json["fetch"])
             if isinstance(feed, dict) and "fetch" in feed:
                 del feed["fetch"]
             if len(feed) == 0:
                 raise ValueError("empty input")
+            end = int(round(_time() * 1000000))
+            prep_time = end - start
+            start = int(round(_time() * 1000000))
             fetch_map = self.client.predict(
                 feed=feed, fetch=fetch, batch=is_batch)
+            end = int(round(_time() * 1000000))
+            midp_time = end - start
+            start = int(round(_time() * 1000000))
             result = self.postprocess(
                 feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
             result = {"result": result}
+            end = int(round(_time() * 1000000))
+            postp_time = end - start
+            succ = 1
         except ValueError as err:
+            succ = 0
             result = {"result": str(err)}
+        if getattr(self, "is_profile", False):
+            end_call = self._profiler.record("call_{}".format(self.data_id))
+            if self.trace_buffer is not None:
+                self.trace_buffer.put({
+                    "name": "DAG",
+                    "id": self.data_id,
+                    "succ": succ,
+                    "actions": {
+                        "call_{}".format(self.data_id): end_call - start_call,
+                    },
+                })
+                trace_que.append({
+                    "name": "demo",
+                    "actions": {
+                        "prep": prep_time,
+                        "midp": midp_time,
+                        "postp": postp_time
+                    }
+                })
+                while trace_que:
+                    info = trace_que[0]
+                    try:
+                        self.trace_buffer.put_nowait(info)
+                        trace_que.popleft()
+                    except queue.Full:
+                        break
+            self.data_id += 1
         return result
 
     def run_rpc_service(self):
@@ -281,6 +335,8 @@ class WebService(object):
             "{}".format(self.model_config), use_gpu=True, gpu_id=self.gpus[0])
 
     def run_web_service(self):
+        if getattr(self, "is_profile", False):
+            self._tracer.start()
         print("This API will be deprecated later. Please do not use it")
         self.app_instance.run(host="0.0.0.0",
                               port=self.port, threaded=True)
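
Usage sketch (not part of the patch): a minimal driver for the new profiling hooks, assuming a BERT-style service built on the patched WebService; the class name, model config directory, and port below are placeholders. setup_profile() must run before run_web_service() so that is_profile, the TimeProfiler, and the tracer's data buffer exist when the instrumented prediction handler records its first request.

    # Hypothetical driver script: names and paths are illustrative only.
    from paddle_serving_server_gpu.web_service import WebService


    class BertService(WebService):
        def preprocess(self, feed=[], fetch=[]):
            # Pass-through preprocess; a real BERT service would tokenize here.
            # The patched handler expects (feed, fetch, is_batch) back.
            return feed, fetch, True


    bert_service = BertService(name="bert")
    bert_service.load_model_config("bert_seq128_model")  # assumed model directory
    bert_service.set_gpus("0")
    bert_service.prepare_server(workdir="workdir", port=9292, device="gpu")
    # Enable tracing before the Flask app starts; trace_interval and thread_num
    # are forwarded to PerformanceTracer by the patched setup_profile().
    bert_service.setup_profile(trace_interval=10, thread_num=1)
    bert_service.run_rpc_service()
    bert_service.run_web_service()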