# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ======================================================================= import json import re import numpy as np import requests import tritonclient.http as httpclient from attrdict import AttrDict from tritonclient.utils import InferenceServerException def convert_http_metadata_config(metadata): metadata = AttrDict(metadata) return metadata def prepare_request(inputs_meta, inputs_data, outputs_meta): ''' inputs_meta: inputs meta information from model. name: info inputs_data: users input data. name: data ''' # Set the input data inputs = [] for input_dict in inputs_meta: input_name = input_dict['name'] if input_name not in inputs_data: raise RuntimeError( 'Error: input name {} required for model not existed.'.format( input_name)) if input_dict['datatype'] == 'FP32': inputs_data[input_name] = inputs_data[input_name].astype( np.float32 ) / 255 # image data returned by gradio is uint8, convert to fp32 if len(input_dict['shape'] ) == 3 and input_dict['shape'][0] == 3: # NCHW inputs_data[input_name] = inputs_data[input_name][0].transpose( 2, 0, 1) elif len(input_dict['shape'] ) == 4 and input_dict['shape'][1] == 3: # NCHW inputs_data[input_name] = inputs_data[input_name].transpose( 0, 3, 1, 2) infer_input = httpclient.InferInput( input_name, inputs_data[input_name].shape, input_dict['datatype']) infer_input.set_data_from_numpy(inputs_data[input_name]) inputs.append(infer_input) outputs = [] for output_dict in outputs_meta: infer_output = httpclient.InferRequestedOutput(output_dict.name) outputs.append(infer_output) return inputs, outputs metrics_table_head = """
{}
模型名称 执行统计 延迟统计
请求处理成功数 请求处理失败数 推理batch数 推理样本数 请求处理时间(ms) 任务队列等待时间(ms) 输入处理时间(ms) 模型推理时间(ms) 输出处理时间(ms)





{}
GPU 性能指标 显存
利用率(%) 功率(W) 功率限制(W) 耗电量(W) 总量(GB) 已使用(GB)
""" def get_metric_data(server_addr, metric_port): # noqa:C901 ''' Get metrics data from fastdeploy server, and transform it into html table. Args: server_addr(str): fastdeployserver ip address metric_port(int): fastdeployserver metrics port Returns: htmltable(str): html table to show metrics data ''' model_table = {} gpu_table = {} metric_column_name = { "Model": { "nv_inference_request_success", "nv_inference_request_failure", "nv_inference_count", "nv_inference_exec_count", "nv_inference_request_duration_us", "nv_inference_queue_duration_us", "nv_inference_compute_input_duration_us", "nv_inference_compute_infer_duration_us", "nv_inference_compute_output_duration_us" }, "GPU": { "nv_gpu_power_usage", "nv_gpu_power_limit", "nv_energy_consumption", "nv_gpu_utilization", "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" }, "CPU": { "nv_cpu_utilization", "nv_cpu_memory_total_bytes", "nv_cpu_memory_used_bytes" } } try: res = requests.get("http://{}:{}/metrics".format( server_addr, metric_port)) except Exception: return metrics_table_head.format('', '') metric_content = res.text for content in metric_content.split('\n'): if content.startswith('#'): continue else: res = re.match(r'(\w+){(.*)} (\w+)', content) # match output by server metrics interface if not res: continue metric_name = res.group(1) model = res.group(2) value = res.group(3) infos = {} for info in model.split(','): k, v = info.split('=') v = v.strip('"') infos[k] = v if metric_name in [ "nv_inference_request_duration_us", "nv_inference_queue_duration_us", "nv_inference_compute_input_duration_us", "nv_inference_compute_infer_duration_us", "nv_inference_compute_output_duration_us" ]: value = str(float(value) / 1000) elif metric_name in [ "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" ]: value = str(float(value) / 1024 / 1024 / 1024) for key, metric_names in metric_column_name.items(): if metric_name in metric_names: if key == 'Model': model_name = infos['model'] if model_name not in model_table: model_table[model_name] = {} model_table[model_name][metric_name] = value elif key == 'GPU': gpu_name = infos['gpu_uuid'] if gpu_name not in gpu_table: gpu_table[gpu_name] = {} gpu_table[gpu_name][metric_name] = value elif key == 'CPU': pass model_data_list = [] gpu_data_list = [] model_data_metric_names = [ "nv_inference_request_success", "nv_inference_request_failure", "nv_inference_exec_count", "nv_inference_count", "nv_inference_request_duration_us", "nv_inference_queue_duration_us", "nv_inference_compute_input_duration_us", "nv_inference_compute_infer_duration_us", "nv_inference_compute_output_duration_us" ] gpu_data_metric_names = [ "nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit", "nv_energy_consumption", "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" ] for k, v in model_table.items(): data = [] data.append(k) for data_metric in model_data_metric_names: data.append(v[data_metric]) model_data_list.append(data) for k, v in gpu_table.items(): data = [] data.append(k) for data_metric in gpu_data_metric_names: data.append(v[data_metric]) gpu_data_list.append(data) model_data = '\n'.join([ "" + '\n'.join(["" + item + "" for item in data]) + "" for data in model_data_list ]) gpu_data = '\n'.join([ "" + '\n'.join(["" + item + "" for item in data]) + "" for data in gpu_data_list ]) return metrics_table_head.format(model_data, gpu_data) class HttpClientManager: def __init__(self): self.clients = {} # server url: httpclient def _create_client(self, server_url): if server_url in self.clients: return self.clients[server_url] try: fastdeploy_client = httpclient.InferenceServerClient(server_url) self.clients[server_url] = fastdeploy_client return fastdeploy_client except Exception: raise RuntimeError( 'Can not connect to server {}, please check your \ server address'.format(server_url)) def infer(self, server_url, model_name, model_version, inputs): fastdeploy_client = self._create_client(server_url) input_metadata, output_metadata = self.get_model_meta( server_url, model_name, model_version) inputs, outputs = prepare_request(input_metadata, inputs, output_metadata) response = fastdeploy_client.infer( model_name, inputs, model_version=model_version, outputs=outputs) results = {} for output in output_metadata: result = response.as_numpy(output.name) # datatype: numpy if output.datatype == 'BYTES': # datatype: bytes try: value = result if len(result.shape) == 1: value = result[0] elif len(result.shape) == 2: value = result[0][0] elif len(result.shape) == 3: value = result[0][0][0] result = json.loads(value) # datatype: json except Exception: pass else: result = result[0] results[output.name] = result return results def raw_infer(self, server_url, model_name, model_version, raw_input): url = 'http://{}/v2/models/{}/versions/{}/infer'.format( server_url, model_name, model_version) res = requests.post(url, data=json.dumps(json.loads(raw_input))) return json.dumps(res.json()) def get_model_meta(self, server_url, model_name, model_version): fastdeploy_client = self._create_client(server_url) try: model_metadata = fastdeploy_client.get_model_metadata( model_name=model_name, model_version=model_version) except InferenceServerException as e: raise RuntimeError("Failed to retrieve the metadata: " + str(e)) model_metadata = convert_http_metadata_config(model_metadata) input_metadata = model_metadata.inputs output_metadata = model_metadata.outputs return input_metadata, output_metadata