# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json
import re

import numpy as np
import requests
import tritonclient.http as httpclient
from attrdict import AttrDict


def convert_http_metadata_config(metadata):
    metadata = AttrDict(metadata)
    return metadata


def prepare_request(inputs_meta, inputs_data, outputs_meta):
    '''
    Prepare tritonclient InferInput and InferRequestedOutput objects
    for an inference request.
    Args:
        inputs_meta: inputs meta information from model, each item
            contains name, datatype and shape.
        inputs_data: users input data, a dict mapping input name to data.
        outputs_meta: outputs meta information from model.
    '''
    # Set the input data
    inputs = []
    for input_dict in inputs_meta:
        input_name = input_dict['name']
        if input_name not in inputs_data:
            raise RuntimeError(
                'Error: input name {} required by the model does not exist.'.
                format(input_name))
        if input_dict['datatype'] == 'FP32':
            inputs_data[input_name] = inputs_data[input_name].astype(
                np.float32
            ) / 255  # image data returned by gradio is uint8, convert to fp32
            if len(input_dict['shape']
                   ) == 3 and input_dict['shape'][0] == 3:  # CHW
                inputs_data[input_name] = inputs_data[input_name][0].transpose(
                    2, 0, 1)
            elif len(input_dict['shape']
                     ) == 4 and input_dict['shape'][1] == 3:  # NCHW
                inputs_data[input_name] = inputs_data[input_name].transpose(
                    0, 3, 1, 2)
        infer_input = httpclient.InferInput(
            input_name, inputs_data[input_name].shape, input_dict['datatype'])
        infer_input.set_data_from_numpy(inputs_data[input_name])
        inputs.append(infer_input)
    outputs = []
    for output_dict in outputs_meta:
        infer_output = httpclient.InferRequestedOutput(output_dict.name)
        outputs.append(infer_output)
    return inputs, outputs
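

# A minimal usage sketch (not part of the original module) showing the kind
# of data prepare_request expects. The metadata dicts, the 'image' input
# name and the 'output' output name below are illustrative assumptions,
# not values taken from a real model.
def _example_prepare_request():
    example_inputs_meta = [{
        'name': 'image',
        'datatype': 'FP32',
        'shape': [1, 3, 224, 224]
    }]
    example_outputs_meta = [AttrDict({'name': 'output'})]
    # gradio returns images as uint8 HWC arrays, here with a batch dimension
    example_inputs_data = {'image': np.zeros((1, 224, 224, 3), dtype=np.uint8)}
    return prepare_request(example_inputs_meta, example_inputs_data,
                           example_outputs_meta)
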
metrics_table_head = """
<table>
<thead>
<tr><th rowspan="2">模型名称</th><th colspan="4">执行统计</th><th colspan="5">延迟统计</th></tr>
<tr><th>请求处理成功数</th><th>请求处理失败数</th><th>推理batch数</th><th>推理样本数</th><th>请求处理时间(ms)</th><th>任务队列等待时间(ms)</th><th>输入处理时间(ms)</th><th>模型推理时间(ms)</th><th>输出处理时间(ms)</th></tr>
</thead>
<tbody>
{}
</tbody>
</table>
<table>
<thead>
<tr><th rowspan="2">GPU</th><th colspan="4">性能指标</th><th colspan="2">显存</th></tr>
<tr><th>利用率(%)</th><th>功率(W)</th><th>功率限制(W)</th><th>耗电量(J)</th><th>总量(GB)</th><th>已使用(GB)</th></tr>
</thead>
<tbody>
{}
</tbody>
</table>
"""

metrics_table_head_en = """
<table>
<thead>
<tr><th rowspan="2">Model name</th><th colspan="4">Execution metric</th><th colspan="5">Delay metric</th></tr>
<tr><th>inference request success</th><th>inference request failure</th><th>inference exec count</th><th>inference count</th><th>inference request duration(ms)</th><th>inference queue duration(ms)</th><th>inference compute input duration(ms)</th><th>inference compute infer duration(ms)</th><th>inference compute output duration(ms)</th></tr>
</thead>
<tbody>
{}
</tbody>
</table>
<table>
<thead>
<tr><th rowspan="2">GPU</th><th colspan="4">Performance metric</th><th colspan="2">Memory</th></tr>
<tr><th>utilization(%)</th><th>power usage(W)</th><th>power limit(W)</th><th>energy consumption(J)</th><th>total(GB)</th><th>used(GB)</th></tr>
</thead>
<tbody>
{}
</tbody>
</table>
"""


def get_metric_data(server_addr, metric_port, lang='zh'):  # noqa:C901
    '''
    Get metrics data from fastdeploy server, and transform it into html table.
    Args:
        server_addr(str): fastdeployserver ip address
        metric_port(int): fastdeployserver metrics port
        lang(str): language of the table head, 'zh' or 'en'
    Returns:
        htmltable(str): html table to show metrics data
    '''
    model_table = {}
    gpu_table = {}
    metric_column_name = {
        "Model": {
            "nv_inference_request_success", "nv_inference_request_failure",
            "nv_inference_count", "nv_inference_exec_count",
            "nv_inference_request_duration_us",
            "nv_inference_queue_duration_us",
            "nv_inference_compute_input_duration_us",
            "nv_inference_compute_infer_duration_us",
            "nv_inference_compute_output_duration_us"
        },
        "GPU": {
            "nv_gpu_power_usage", "nv_gpu_power_limit",
            "nv_energy_consumption", "nv_gpu_utilization",
            "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
        },
        "CPU": {
            "nv_cpu_utilization", "nv_cpu_memory_total_bytes",
            "nv_cpu_memory_used_bytes"
        }
    }
    try:
        res = requests.get("http://{}:{}/metrics".format(
            server_addr, metric_port))
    except Exception:
        # server not reachable, return an empty table
        if lang == 'en':
            return metrics_table_head_en.format('', '')
        return metrics_table_head.format('', '')
    metric_content = res.text
    for content in metric_content.split('\n'):
        if content.startswith('#'):
            continue
        # match output of the server metrics interface, one line looks like
        # metric_name{label="value",...} value
        res = re.match(r'(\w+){(.*)} (.+)', content)
        if not res:
            continue
        metric_name = res.group(1)
        model = res.group(2)
        value = res.group(3)
        infos = {}
        for info in model.split(','):
            k, v = info.split('=')
            v = v.strip('"')
            infos[k] = v
        if metric_name in [
                "nv_inference_request_duration_us",
                "nv_inference_queue_duration_us",
                "nv_inference_compute_input_duration_us",
                "nv_inference_compute_infer_duration_us",
                "nv_inference_compute_output_duration_us"
        ]:
            value = str(float(value) / 1000)  # us -> ms
        elif metric_name in [
                "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
        ]:
            value = str(float(value) / 1024 / 1024 / 1024)  # bytes -> GB
        for key, metric_names in metric_column_name.items():
            if metric_name in metric_names:
                if key == 'Model':
                    model_name = infos['model']
                    if model_name not in model_table:
                        model_table[model_name] = {}
                    model_table[model_name][metric_name] = value
                elif key == 'GPU':
                    gpu_name = infos['gpu_uuid']
                    if gpu_name not in gpu_table:
                        gpu_table[gpu_name] = {}
                    gpu_table[gpu_name][metric_name] = value
                elif key == 'CPU':
                    pass
    model_data_list = []
    gpu_data_list = []
    model_data_metric_names = [
        "nv_inference_request_success", "nv_inference_request_failure",
        "nv_inference_exec_count", "nv_inference_count",
        "nv_inference_request_duration_us", "nv_inference_queue_duration_us",
        "nv_inference_compute_input_duration_us",
        "nv_inference_compute_infer_duration_us",
        "nv_inference_compute_output_duration_us"
    ]
    gpu_data_metric_names = [
        "nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit",
        "nv_energy_consumption", "nv_gpu_memory_total_bytes",
        "nv_gpu_memory_used_bytes"
    ]
    for k, v in model_table.items():
        data = []
        data.append(k)
        for data_metric in model_data_metric_names:
            data.append(v[data_metric])
        model_data_list.append(data)
    for k, v in gpu_table.items():
        data = []
        data.append(k)
        for data_metric in gpu_data_metric_names:
            data.append(v[data_metric])
        gpu_data_list.append(data)
    model_data = '\n'.join([
        "<tr>" + '\n'.join(["<td>" + item + "</td>" for item in data]) +
        "</tr>" for data in model_data_list
    ])
    gpu_data = '\n'.join([
        "<tr>" + '\n'.join(["<td>" + item + "</td>" for item in data]) +
        "</tr>" for data in gpu_data_list
    ])
    if lang == 'en':
        return metrics_table_head_en.format(model_data, gpu_data)
    return metrics_table_head.format(model_data, gpu_data)


class HttpClientManager:
    def __init__(self):
        self.clients = {}  # server url: httpclient.InferenceServerClient

    def _create_client(self, server_url):
        if server_url in self.clients:
            return self.clients[server_url]
        try:
            fastdeploy_client = httpclient.InferenceServerClient(server_url)
            self.clients[server_url] = fastdeploy_client
            return fastdeploy_client
        except Exception:
            raise RuntimeError(
                'Can not connect to server {}, please check your '
                'server address'.format(server_url))

    def infer(self, server_url, model_name, model_version, inputs):
        fastdeploy_client = self._create_client(server_url)
        input_metadata, output_metadata = self.get_model_meta(
            server_url, model_name, model_version)
        inputs, outputs = prepare_request(input_metadata, inputs,
                                          output_metadata)
        response = fastdeploy_client.infer(
            model_name, inputs, model_version=model_version, outputs=outputs)
        results = {}
        for output in output_metadata:
            result = response.as_numpy(output.name)  # datatype: numpy
            if output.datatype == 'BYTES':  # datatype: bytes
                try:
                    value = result
                    if len(result.shape) == 1:
                        value = result[0]
                    elif len(result.shape) == 2:
                        value = result[0][0]
                    elif len(result.shape) == 3:
                        value = result[0][0][0]
                    result = json.loads(value)  # datatype: json
                except Exception:
                    pass
            else:
                result = result[0]
            results[output.name] = result
        return results

    def raw_infer(self, server_url, model_name, model_version, raw_input):
        url = 'http://{}/v2/models/{}/versions/{}/infer'.format(
            server_url, model_name, model_version)
        res = requests.post(url, data=json.dumps(json.loads(raw_input)))
        return json.dumps(res.json())

    def get_model_meta(self, server_url, model_name, model_version):
        fastdeploy_client = self._create_client(server_url)
        try:
            model_metadata = fastdeploy_client.get_model_metadata(
                model_name=model_name, model_version=model_version)
        except Exception as e:
            raise RuntimeError("Failed to retrieve the metadata: " + str(e))
        model_metadata = convert_http_metadata_config(model_metadata)
        input_metadata = model_metadata.inputs
        output_metadata = model_metadata.outputs
        return input_metadata, output_metadata
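

# A minimal sketch (not part of the original module) of how a single
# Prometheus-style line from the fastdeployserver /metrics endpoint is
# parsed by get_metric_data above. The sample metric line and its values
# are made up for illustration only.
def _example_parse_metric_line():
    sample = 'nv_gpu_utilization{gpu_uuid="GPU-00000000"} 0.25'
    matched = re.match(r'(\w+){(.*)} (.+)', sample)
    # groups: metric name, comma-separated labels, metric value
    return matched.group(1), matched.group(2), matched.group(3)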
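

# A minimal usage sketch (not part of the original module). The server
# address, model name, model version and input name below are placeholder
# assumptions and must be replaced with values from a running
# fastdeployserver instance.
if __name__ == '__main__':
    manager = HttpClientManager()
    dummy_image = np.zeros((1, 224, 224, 3), dtype=np.uint8)  # fake uint8 image
    outputs = manager.infer('localhost:8000', 'example_model', '1',
                            {'image': dummy_image})
    print(list(outputs.keys()))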