Unverified commit a418dd44 authored by chenjian, committed by GitHub

Add fastdeploy server and client component (#1169)

* add backend support for fastdeploy server

* fix

* add code

* fix

* fix

* add fastdeploy server component

* add fastdeploy server and client

* add exception description

* fix

* add model repository judgement

* add component tab for fastdeploy client

* update more tasks in fastdeploy client

* sort filenames

* backup config

* noqa for autogenerated file

* add data validation

* add __init__ for package

* add calculating layout for frontend

* add alive server detection and optimize client

* add alive server detection and optimize client

* add alive server detection and optimize client

* add metrics in gradio client

* update presentation

* Change return value to None for frontend performance data when server not ready

* add get_server_config and download_pretrain_model api

* add get_server_config and download_pretrain_model api

* add unit for metric table

* add unit for metric table

* fix a bug

* add judgement pretrained model download

* add judgement pretrained model download

* add version info for frontend

* rename download model

* fix a bug

* add fastdeploy model list

* optimize for choose configuration files

* modify according to frontend need

* fix name in config to model name

* optimize for server list and alive judgement

* keep server name as string type

* optimize process judgement logic

* optimize for deleting resource files

* add rename resource file

* fix

* fix a bug

* optimize code structure

* optimize code structure

* remove chinese tips and remove fastdeploy-python in requirements
Parent b90619b9
@@ -12,4 +12,8 @@ multiprocess
 packaging
 x2paddle
 rarfile
-onnx >= 1.6.0
\ No newline at end of file
+gradio
+tritonclient[all]
+attrdict
+psutil
+onnx >= 1.6.0
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import gradio as gr
import numpy as np
from .http_client_manager import get_metric_data
from .http_client_manager import HttpClientManager
from .http_client_manager import metrics_table_head
from .visualizer import visualize_detection
from .visualizer import visualize_face_alignment
from .visualizer import visualize_face_detection
from .visualizer import visualize_headpose
from .visualizer import visualize_keypoint_detection
from .visualizer import visualize_matting
from .visualizer import visualize_ocr
from .visualizer import visualize_segmentation
_http_manager = HttpClientManager()
supported_tasks = {
'detection': visualize_detection,
'facedet': visualize_face_detection,
'keypointdetection': visualize_keypoint_detection,
'segmentation': visualize_segmentation,
'matting': visualize_matting,
'ocr': visualize_ocr,
'facealignment': visualize_face_alignment,
'headpose': visualize_headpose,
'unspecified': lambda x: str(x)
}
def create_gradio_client_app(): # noqa:C901
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-color: black;
background: black;
}
input[type='range'] {
accent-color: black;
}
.dark input[type='range'] {
accent-color: #dfdfdf;
}
#gallery {
min-height: 22rem;
margin-bottom: 15px;
margin-left: auto;
margin-right: auto;
border-bottom-right-radius: .5rem !important;
border-bottom-left-radius: .5rem !important;
}
#gallery>div>.h-full {
min-height: 20rem;
}
.details:hover {
text-decoration: underline;
}
.gr-button {
white-space: nowrap;
}
.gr-button:focus {
border-color: rgb(147 197 253 / var(--tw-border-opacity));
outline: none;
box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
--tw-border-opacity: 1;
--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) \
var(--tw-ring-offset-color);
--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
--tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
--tw-ring-opacity: .5;
}
.footer {
margin-bottom: 45px;
margin-top: 35px;
text-align: center;
border-bottom: 1px solid #e5e5e5;
}
.footer>p {
font-size: .8rem;
display: inline-block;
padding: 0 10px;
transform: translateY(10px);
background: white;
}
.dark .footer {
border-color: #303030;
}
.dark .footer>p {
background: #0b0f19;
}
.prompt h4{
margin: 1.25em 0 .25em 0;
font-weight: bold;
font-size: 115%;
}
"""
block = gr.Blocks(css=css)
with block:
gr.HTML("""
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
<div
style="
display: inline-flex;
gap: 0.8rem;
font-size: 1.75rem;
justify-content: center;
"
>
<h1>
FastDeploy Client
</h1>
</div>
<p font-size: 94%">
The client is used for creating requests to fastdeploy server.
</p>
</div>
""")
with gr.Group():
with gr.Box():
with gr.Column():
with gr.Row():
server_addr_text = gr.Textbox(
label="服务ip",
show_label=True,
max_lines=1,
placeholder="localhost",
)
server_http_port_text = gr.Textbox(
label="推理服务端口",
show_label=True,
max_lines=1,
placeholder="8000",
)
server_metric_port_text = gr.Textbox(
label="性能服务端口",
show_label=True,
max_lines=1,
placeholder="8002",
)
with gr.Row():
model_name_text = gr.Textbox(
label="模型名称",
show_label=True,
max_lines=1,
placeholder="yolov5",
)
model_version_text = gr.Textbox(
label="模型版本",
show_label=True,
max_lines=1,
placeholder="1",
)
with gr.Box():
with gr.Tab("组件形式"):
check_button = gr.Button("获取模型输入输出")
component_format_column = gr.Column(visible=False)
with component_format_column:
task_radio = gr.Radio(
choices=list(supported_tasks.keys()),
value='unspecified',
label='任务类型',
visible=True)
gr.Markdown("根据模型需要,挑选文本框或者图像框进行输入")
with gr.Row():
with gr.Column():
gr.Markdown("模型输入")
input_accordions = []
input_name_texts = []
input_images = []
input_texts = []
for i in range(6):
accordion = gr.Accordion(
"输入变量 {}".format(i),
open=True,
visible=False)
with accordion:
input_name_text = gr.Textbox(
label="变量名", interactive=False)
input_image = gr.Image(type='numpy')
input_text = gr.Textbox(
label="文本框", max_lines=1000)
input_accordions.append(accordion)
input_name_texts.append(input_name_text)
input_images.append(input_image)
input_texts.append(input_text)
with gr.Column():
gr.Markdown("模型输出")
output_accordions = []
output_name_texts = []
output_images = []
output_texts = []
for i in range(6):
accordion = gr.Accordion(
"输出变量 {}".format(i),
open=True,
visible=False)
with accordion:
output_name_text = gr.Textbox(
label="变量名", interactive=False)
output_text = gr.Textbox(
label="服务返回的原数据",
interactive=False,
show_label=True)
output_image = gr.Image(
interactive=False)
output_accordions.append(accordion)
output_name_texts.append(output_name_text)
output_images.append(output_image)
output_texts.append(output_text)
component_submit_button = gr.Button("提交请求")
with gr.Tab("原始形式"):
gr.Markdown("模型输入")
raw_payload_text = gr.Textbox(
label="负载数据", max_lines=10000)
with gr.Column():
gr.Markdown("输出")
output_raw_text = gr.Textbox(
label="服务返回的原始数据", interactive=False)
raw_submit_button = gr.Button("提交请求")
with gr.Box():
with gr.Column():
gr.Markdown("服务性能统计(每次提交请求会自动更新数据,您也可以手动点击更新)")
output_html_table = gr.HTML(
label="metrics",
interactive=False,
show_label=False,
value=metrics_table_head.format('', ''))
update_metric_button = gr.Button("更新统计数据")
status_text = gr.Textbox(
label="status",
show_label=True,
max_lines=1,
interactive=False)
all_input_output_components = input_accordions + input_name_texts + input_images + \
input_texts + output_accordions + output_name_texts + output_images + output_texts
def get_input_output_name(server_ip, server_port, model_name,
model_version):
try:
server_addr = server_ip + ':' + server_port
input_metas, output_metas = _http_manager.get_model_meta(
server_addr, model_name, model_version)
except Exception as e:
return {status_text: str(e)}
results = {
component: None
for component in all_input_output_components
}
results[component_format_column] = gr.update(visible=True)
# results[check_button] = gr.update(visible=False)
for input_accordio in input_accordions:
results[input_accordio] = gr.update(visible=False)
for output_accordio in output_accordions:
results[output_accordio] = gr.update(visible=False)
results[status_text] = 'GetInputOutputName Successful'
for i, input_meta in enumerate(input_metas):
results[input_accordions[i]] = gr.update(visible=True)
results[input_name_texts[i]] = input_meta['name']
for i, output_meta in enumerate(output_metas):
results[output_accordions[i]] = gr.update(visible=True)
results[output_name_texts[i]] = output_meta['name']
return results
def component_inference(*args):
server_ip = args[0]
http_port = args[1]
metric_port = args[2]
model_name = args[3]
model_version = args[4]
names = args[5:5 + len(input_name_texts)]
images = args[5 + len(input_name_texts):5 + len(input_name_texts) +
len(input_images)]
texts = args[5 + len(input_name_texts) + len(input_images):5 +
len(input_name_texts) + len(input_images) +
len(input_texts)]
task_type = args[-1]
server_addr = server_ip + ':' + http_port
if server_ip and http_port and model_name and model_version:
inputs = {}
for i, input_name in enumerate(names):
if input_name:
if images[i] is not None:
inputs[input_name] = np.array([images[i]])
if texts[i]:
inputs[input_name] = np.array(
[[texts[i].encode('utf-8')]], dtype=np.object_)
try:
infer_results = _http_manager.infer(
server_addr, model_name, model_version, inputs)
results = {status_text: 'Inference Successful'}
for i, (output_name,
data) in enumerate(infer_results.items()):
results[output_name_texts[i]] = output_name
results[output_texts[i]] = str(data)
if task_type != 'unspecified':
try:
results[output_images[i]] = supported_tasks[
task_type](images[0], data)
except Exception:
results[output_images[i]] = None
if metric_port:
html_table = get_metric_data(server_ip, metric_port)
results[output_html_table] = html_table
return results
except Exception as e:
return {status_text: 'Error: {}'.format(e)}
else:
return {
status_text:
'Please input server addr, model name and model version.'
}
def raw_inference(*args):
server_ip = args[0]
http_port = args[1]
metric_port = args[2]
model_name = args[3]
model_version = args[4]
payload_text = args[5]
server_addr = server_ip + ':' + http_port
try:
result = _http_manager.raw_infer(server_addr, model_name,
model_version, payload_text)
results = {
status_text: 'Get response from server',
output_raw_text: result
}
if server_ip and metric_port:
html_table = get_metric_data(server_ip, metric_port)
results[output_html_table] = html_table
return results
except Exception as e:
return {status_text: 'Error: {}'.format(e)}
def update_metric(server_ip, metrics_port):
if server_ip and metrics_port:
try:
html_table = get_metric_data(server_ip, metrics_port)
return {
output_html_table: html_table,
status_text: "Successfully update metrics."
}
except Exception as e:
return {status_text: 'Error: {}'.format(e)}
else:
return {
status_text: 'Please input server ip and metrics_port.'
}
check_button.click(
fn=get_input_output_name,
inputs=[
server_addr_text, server_http_port_text, model_name_text,
model_version_text
],
outputs=[
*all_input_output_components, check_button,
component_format_column, status_text
])
component_submit_button.click(
fn=component_inference,
inputs=[
server_addr_text, server_http_port_text,
server_metric_port_text, model_name_text, model_version_text,
*input_name_texts, *input_images, *input_texts, task_radio
],
outputs=[
*output_name_texts, *output_images, *output_texts, status_text,
output_html_table
])
raw_submit_button.click(
fn=raw_inference,
inputs=[
server_addr_text, server_http_port_text,
server_metric_port_text, model_name_text, model_version_text,
raw_payload_text
],
outputs=[output_raw_text, status_text, output_html_table])
update_metric_button.click(
fn=update_metric,
inputs=[server_addr_text, server_metric_port_text],
outputs=[output_html_table, status_text])
return block
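For reference, a minimal sketch (not part of this commit) of running the client app standalone. It assumes gradio is installed and that this module is importable as visualdl.component.inference.fastdeploy_client.client_app; that module path is an assumption based on the relative imports above.

# Hypothetical standalone launcher for the client app defined above.
from visualdl.component.inference.fastdeploy_client.client_app import create_gradio_client_app  # assumed path

if __name__ == '__main__':
    app = create_gradio_client_app()
    # gr.Blocks.launch() starts a local web server for the UI.
    app.launch(server_name='0.0.0.0', server_port=7860)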
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json
import re
import numpy as np
import requests
import tritonclient.http as httpclient
from attrdict import AttrDict
from tritonclient.utils import InferenceServerException
def convert_http_metadata_config(metadata):
metadata = AttrDict(metadata)
return metadata
def prepare_request(inputs_meta, inputs_data, outputs_meta):
'''
inputs_meta: input metadata from the model, as a mapping of name -> info.
inputs_data: the user's input data, as a mapping of name -> data.
'''
# Set the input data
inputs = []
for input_dict in inputs_meta:
input_name = input_dict['name']
if input_name not in inputs_data:
raise RuntimeError(
'Error: input name {} required by the model does not exist.'.format(
input_name))
if input_dict['datatype'] == 'FP32':
inputs_data[input_name] = inputs_data[input_name].astype(
np.float32
) / 255 # image data returned by gradio is uint8, convert to fp32
if len(input_dict['shape']
) == 3 and input_dict['shape'][0] == 3: # NCHW
inputs_data[input_name] = inputs_data[input_name][0].transpose(
2, 0, 1)
elif len(input_dict['shape']
) == 4 and input_dict['shape'][1] == 3: # NCHW
inputs_data[input_name] = inputs_data[input_name].transpose(
0, 3, 1, 2)
infer_input = httpclient.InferInput(
input_name, inputs_data[input_name].shape, input_dict['datatype'])
infer_input.set_data_from_numpy(inputs_data[input_name])
inputs.append(infer_input)
outputs = []
for output_dict in outputs_meta:
infer_output = httpclient.InferRequestedOutput(output_dict.name)
outputs.append(infer_output)
return inputs, outputs
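To make the reshaping above concrete, here is an illustrative sketch (not from the commit) that feeds a gradio-style uint8 HWC image through prepare_request. It assumes tritonclient is installed; the module path and the tensor name 'INPUT_0' are placeholders.

# Hypothetical example: prepare_request converts a batched HWC uint8 image to FP32 NCHW in place.
import numpy as np
from visualdl.component.inference.fastdeploy_client.http_client_manager import prepare_request  # assumed path

inputs_meta = [{'name': 'INPUT_0', 'datatype': 'FP32', 'shape': [1, 3, 224, 224]}]
outputs_meta = []  # in real use these come from the server's model metadata
inputs_data = {'INPUT_0': np.zeros((1, 224, 224, 3), dtype=np.uint8)}  # as returned by gr.Image, plus batching

infer_inputs, infer_outputs = prepare_request(inputs_meta, inputs_data, outputs_meta)
print(inputs_data['INPUT_0'].shape, inputs_data['INPUT_0'].dtype)  # (1, 3, 224, 224) float32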
metrics_table_head = """
<style>
table, th {{
border:0.1px solid black;
}}
</style>
<div>
<table style="width:100%">
<tr>
<th rowspan="2">模型名称</th>
<th colspan="4">执行统计</th>
<th colspan="5">延迟统计</th>
</tr>
<tr>
<th>请求处理成功数</th>
<th>请求处理失败数</th>
<th>推理batch数</th>
<th>推理样本数</th>
<th>请求处理时间(ms)</th>
<th>任务队列等待时间(ms)</th>
<th>输入处理时间(ms)</th>
<th>模型推理时间(ms)</th>
<th>输出处理时间(ms)</th>
</tr>
{}
</table>
</div>
<br>
<br>
<br>
<br>
<br>
<div>
<table style="width:100%">
<tr>
<th rowspan="2">GPU</th>
<th colspan="4">性能指标</th>
<th colspan="2">显存</th>
</tr>
<tr>
<th>利用率(%)</th>
<th>功率(W)</th>
<th>功率限制(W)</th>
<th>耗电量(W)</th>
<th>总量(GB)</th>
<th>已使用(GB)</th>
</tr>
{}
</table>
</div>
"""
def get_metric_data(server_addr, metric_port): # noqa:C901
'''
Get metrics data from fastdeploy server, and transform it into html table.
Args:
server_addr(str): fastdeployserver ip address
metric_port(int): fastdeployserver metrics port
Returns:
htmltable(str): html table to show metrics data
'''
model_table = {}
gpu_table = {}
metric_column_name = {
"Model": {
"nv_inference_request_success", "nv_inference_request_failure",
"nv_inference_count", "nv_inference_exec_count",
"nv_inference_request_duration_us",
"nv_inference_queue_duration_us",
"nv_inference_compute_input_duration_us",
"nv_inference_compute_infer_duration_us",
"nv_inference_compute_output_duration_us"
},
"GPU": {
"nv_gpu_power_usage", "nv_gpu_power_limit",
"nv_energy_consumption", "nv_gpu_utilization",
"nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
},
"CPU": {
"nv_cpu_utilization", "nv_cpu_memory_total_bytes",
"nv_cpu_memory_used_bytes"
}
}
try:
res = requests.get("http://{}:{}/metrics".format(
server_addr, metric_port))
except Exception:
return metrics_table_head.format('', '')
metric_content = res.text
for content in metric_content.split('\n'):
if content.startswith('#'):
continue
else:
res = re.match(r'(\w+){(.*)} (\w+)',
content) # match output by server metrics interface
if not res:
continue
metric_name = res.group(1)
model = res.group(2)
value = res.group(3)
infos = {}
for info in model.split(','):
k, v = info.split('=')
v = v.strip('"')
infos[k] = v
if metric_name in [
"nv_inference_request_duration_us",
"nv_inference_queue_duration_us",
"nv_inference_compute_input_duration_us",
"nv_inference_compute_infer_duration_us",
"nv_inference_compute_output_duration_us"
]:
value = str(float(value) / 1000)
elif metric_name in [
"nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
]:
value = str(float(value) / 1024 / 1024 / 1024)
for key, metric_names in metric_column_name.items():
if metric_name in metric_names:
if key == 'Model':
model_name = infos['model']
if model_name not in model_table:
model_table[model_name] = {}
model_table[model_name][metric_name] = value
elif key == 'GPU':
gpu_name = infos['gpu_uuid']
if gpu_name not in gpu_table:
gpu_table[gpu_name] = {}
gpu_table[gpu_name][metric_name] = value
elif key == 'CPU':
pass
model_data_list = []
gpu_data_list = []
model_data_metric_names = [
"nv_inference_request_success", "nv_inference_request_failure",
"nv_inference_exec_count", "nv_inference_count",
"nv_inference_request_duration_us", "nv_inference_queue_duration_us",
"nv_inference_compute_input_duration_us",
"nv_inference_compute_infer_duration_us",
"nv_inference_compute_output_duration_us"
]
gpu_data_metric_names = [
"nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit",
"nv_energy_consumption", "nv_gpu_memory_total_bytes",
"nv_gpu_memory_used_bytes"
]
for k, v in model_table.items():
data = []
data.append(k)
for data_metric in model_data_metric_names:
data.append(v[data_metric])
model_data_list.append(data)
for k, v in gpu_table.items():
data = []
data.append(k)
for data_metric in gpu_data_metric_names:
data.append(v[data_metric])
gpu_data_list.append(data)
model_data = '\n'.join([
"<tr>" + '\n'.join(["<td>" + item + "</td>"
for item in data]) + "</tr>"
for data in model_data_list
])
gpu_data = '\n'.join([
"<tr>" + '\n'.join(["<td>" + item + "</td>"
for item in data]) + "</tr>"
for data in gpu_data_list
])
return metrics_table_head.format(model_data, gpu_data)
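For context, the regex above parses Prometheus-style lines exposed by the server's /metrics endpoint. A small self-contained sketch of that parsing step follows; the sample line is illustrative, not captured from a real server.

# Hypothetical example of the line format handled by get_metric_data.
import re

sample = 'nv_inference_request_success{model="yolov5",version="1"} 42'
match = re.match(r'(\w+){(.*)} (\w+)', sample)
metric_name, labels, value = match.group(1), match.group(2), match.group(3)
infos = {k: v.strip('"') for k, v in (item.split('=') for item in labels.split(','))}
print(metric_name, infos, value)  # nv_inference_request_success {'model': 'yolov5', 'version': '1'} 42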
class HttpClientManager:
def __init__(self):
self.clients = {} # server url: httpclient
def _create_client(self, server_url):
if server_url in self.clients:
return self.clients[server_url]
try:
fastdeploy_client = httpclient.InferenceServerClient(server_url)
self.clients[server_url] = fastdeploy_client
return fastdeploy_client
except Exception:
raise RuntimeError(
'Can not connect to server {}, please check your \
server address'.format(server_url))
def infer(self, server_url, model_name, model_version, inputs):
fastdeploy_client = self._create_client(server_url)
input_metadata, output_metadata = self.get_model_meta(
server_url, model_name, model_version)
inputs, outputs = prepare_request(input_metadata, inputs,
output_metadata)
response = fastdeploy_client.infer(
model_name, inputs, model_version=model_version, outputs=outputs)
results = {}
for output in output_metadata:
result = response.as_numpy(output.name) # datatype: numpy
if output.datatype == 'BYTES': # datatype: bytes
try:
value = result
if len(result.shape) == 1:
value = result[0]
elif len(result.shape) == 2:
value = result[0][0]
elif len(result.shape) == 3:
value = result[0][0][0]
result = json.loads(value) # datatype: json
except Exception:
pass
else:
result = result[0]
results[output.name] = result
return results
def raw_infer(self, server_url, model_name, model_version, raw_input):
url = 'http://{}/v2/models/{}/versions/{}/infer'.format(
server_url, model_name, model_version)
res = requests.post(url, data=json.dumps(json.loads(raw_input)))
return json.dumps(res.json())
def get_model_meta(self, server_url, model_name, model_version):
fastdeploy_client = self._create_client(server_url)
try:
model_metadata = fastdeploy_client.get_model_metadata(
model_name=model_name, model_version=model_version)
except InferenceServerException as e:
raise RuntimeError("Failed to retrieve the metadata: " + str(e))
model_metadata = convert_http_metadata_config(model_metadata)
input_metadata = model_metadata.inputs
output_metadata = model_metadata.outputs
return input_metadata, output_metadata
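A short usage sketch for HttpClientManager, mirroring what the gradio callbacks do. The address, model name and image shape are placeholders, the import path is assumed, and a fastdeployserver (Triton-compatible) HTTP endpoint must be reachable for it to succeed.

# Hypothetical example: drive HttpClientManager directly instead of through the gradio UI.
import numpy as np
from visualdl.component.inference.fastdeploy_client.http_client_manager import HttpClientManager  # assumed path

manager = HttpClientManager()
inputs_meta, outputs_meta = manager.get_model_meta('localhost:8000', 'yolov5', '1')
print([meta['name'] for meta in inputs_meta])

image = np.zeros((1, 320, 320, 3), dtype=np.uint8)  # batched HWC image, as the gradio app would supply
results = manager.infer('localhost:8000', 'yolov5', '1', {inputs_meta[0]['name']: image})
print(list(results.keys()))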
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import numpy as np
__all__ = [
'visualize_detection', 'visualize_keypoint_detection',
'visualize_face_detection', 'visualize_face_alignment',
'visualize_segmentation', 'visualize_matting', 'visualize_ocr',
'visualize_headpose'
]
def visualize_detection(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
boxes = np.array(data['boxes'])
scores = np.array(data['scores'])
label_ids = np.array(data['label_ids'])
masks = np.array(data['masks'])
contain_masks = data['contain_masks']
detection_result = fd.C.vision.DetectionResult()
detection_result.boxes = boxes
detection_result.scores = scores
detection_result.label_ids = label_ids
detection_result.masks = masks
detection_result.contain_masks = contain_masks
result = fd.vision.vis_detection(image, detection_result)
return result
def visualize_keypoint_detection(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
keypoints = np.array(data['keypoints'])
scores = np.array(data['scores'])
num_joints = np.array(data['num_joints'])
detection_result = fd.C.vision.KeyPointDetectionResult()
detection_result.keypoints = keypoints
detection_result.scores = scores
detection_result.num_joints = num_joints
result = fd.vision.vis_keypoint_detection(image, detection_result)
return result
def visualize_face_detection(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
face_data = np.array(data['data'])
scores = np.array(data['scores'])
landmarks = np.array(data['landmarks'])
landmarks_per_face = data['landmarks_per_face']
detection_result = fd.C.vision.FaceDetectionResult()
detection_result.data = face_data  # avoid clobbering the payload dict by reusing the name 'data'
detection_result.scores = scores
detection_result.landmarks = landmarks
detection_result.landmarks_per_face = landmarks_per_face
result = fd.vision.vis_face_detection(image, detection_result)
return result
def visualize_face_alignment(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
landmarks = np.array(data['landmarks'])
facealignment_result = fd.C.vision.FaceAlignmentResult()
facealignment_result.landmarks = landmarks
result = fd.vision.vis_face_alignment(image, facealignment_result)
return result
def visualize_segmentation(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
label_ids = np.array(data['label_ids'])
score_map = np.array(data['score_map'])
shape = np.array(data['shape'])
segmentation_result = fd.C.vision.SegmentationResult()
segmentation_result.shape = shape
segmentation_result.score_map = score_map
segmentation_result.label_ids = label_ids
result = fd.vision.vis_segmentation(image, segmentation_result)
return result
def visualize_matting(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
alpha = np.array(data['alpha'])
foreground = np.array(data['foreground'])
contain_foreground = data['contain_foreground']
shape = np.array(data['shape'])
matting_result = fd.C.vision.MattingResult()
matting_result.alpha = alpha
matting_result.foreground = foreground
matting_result.contain_foreground = contain_foreground
matting_result.shape = shape
result = fd.vision.vis_matting(image, matting_result)
return result
def visualize_ocr(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
boxes = np.array(data['boxes'])
text = np.array(data['text'])
rec_scores = np.array(data['rec_scores'])
cls_scores = np.array(data['cls_scores'])
cls_labels = data['cls_labels']
ocr_result = fd.C.vision.OCRResult()
ocr_result.boxes = boxes
ocr_result.text = text
ocr_result.rec_scores = rec_scores
ocr_result.cls_scores = cls_scores
ocr_result.cls_labels = cls_labels
result = fd.vision.vis_ppocr(image, ocr_result)
return result
def visualize_headpose(image, data):
try:
import fastdeploy as fd
except Exception:
raise RuntimeError(
"fastdeploy is required for visualizing results,please refer to \
https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
euler_angles = np.array(data['euler_angles'])
headpose_result = fd.C.vision.HeadPoseResult()
headpose_result.euler_angles = euler_angles
result = fd.vision.vis_headpose(image, headpose_result)
return result
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import copy
import json
import os
import random
import re
import signal
import string
from collections import defaultdict
from subprocess import Popen
from subprocess import STDOUT
import google.protobuf.json_format as json_format
import google.protobuf.text_format as text_format
import psutil
import requests
from .proto.model_config_pb2 import ModelConfig
from visualdl.utils.dir import FASTDEPLOYSERVER_PATH
def pbtxt2json(content: str):
'''
Convert protocol messages in text format to json format string.
'''
message = text_format.Parse(content, ModelConfig())
json_string = json_format.MessageToJson(message)
return json_string
def json2pbtxt(content: str):
'''
Convert json format string to protocol messages in text format.
'''
message = json_format.Parse(content, ModelConfig())
text_proto = text_format.MessageToString(message)
return text_proto
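A small round-trip sketch for the two converters above, using a minimal Triton-style config. It is meant to run in the context of this module (ModelConfig and the helpers defined here); the model name and dims are placeholders.

# Hypothetical round trip between text protobuf and JSON.
pbtxt = '''
name: "yolov5"
backend: "fastdeploy"
max_batch_size: 1
input { name: "images" data_type: TYPE_FP32 dims: [3, 640, 640] }
output { name: "output" data_type: TYPE_FP32 dims: [25200, 85] }
'''
as_json = pbtxt2json(pbtxt)
print(as_json)               # JSON string mirroring the ModelConfig fields
print(json2pbtxt(as_json))   # back to text protobuf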
def validate_data(model_config):
'''
Validate data in the model config. Empty values received from the front end should be checked;
the easiest way to handle them is to drop them.
Args:
model_config: model config to be saved in config file
Return:
model config after filtering.
'''
model_config_filtered = {}
for key, value in model_config.items():
if value:
model_config_filtered[key] = value
return model_config_filtered
def analyse_config(cur_dir: str):
'''
Analyse the model config in specified directory.
Return a json object to describe configuration.
'''
all_model_configs = {}
all_model_versions = {}
parent_dir, sub_dirs, filenames = os.walk(cur_dir).send(
None)  # a model repository can only contain model directories at its top level,
# so we only need to search directories at depth 1.
for model_dir_name in sub_dirs:
model_dir, model_sub_dirs, filenames = os.walk(
os.path.join(parent_dir, model_dir_name)).send(None)
model_name = os.path.basename(model_dir)
config_filenames = []
for filename in filenames:
if '.pbtxt' in filename:
config_filenames.append(
filename
) # filenames with extension .pbtxt are all config files
if config_filenames:
default_config_filename = config_filenames[0]
if 'config.pbtxt' in config_filenames:
default_config_filename = 'config.pbtxt'
config_filenames.remove(default_config_filename)
config_filenames.insert(0, default_config_filename)
else:
# if no config.pbtxt, we choose the first file in config_filenames list to create config.pbtxt
copy_config_file_to_default_config(model_dir,
default_config_filename)
default_config_filename = 'config.pbtxt'
config_filenames.insert(0, default_config_filename)
json_config = json.loads(
pbtxt2json(
open(os.path.join(model_dir,
default_config_filename)).read()))
json_config["config_filenames"] = config_filenames[
0] # add config_filenames to config data (frontend developer said he only wanted one filename,
# and to request config_filenames by get_config_filenames_for_one_model later)
all_model_configs[
model_name] = json_config # store original config file content in json format
json_config[
'name'] = model_name  # the name in the config data may differ from model_name
# (model_name is actually the model directory name), so we make name conform to model_name.
else:
continue
for model_sub_dir in model_sub_dirs:
if re.match(
r'\d+',
model_sub_dir): # version directory consists of numbers
if model_name not in all_model_versions:
all_model_versions[model_name] = {}
if model_sub_dir not in all_model_versions[model_name]:
all_model_versions[model_name][model_sub_dir] = []
for version_resource_file in os.listdir(
os.path.join(model_dir, model_sub_dir)):
all_model_versions[model_name][model_sub_dir].append(
version_resource_file)
if model_name not in all_model_versions:  # if a model has a config but no version directory,
# we create one for the user's convenience
all_model_versions[model_name] = {}
os.mkdir(os.path.join(model_dir, '1'))
all_model_versions[model_name]['1'] = []
if not all_model_configs:
raise Exception(
'The path you choose is not a valid model repository, please choose a valid path.'
)
return all_model_configs, all_model_versions
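analyse_config expects the standard Triton model repository layout: one directory per model at the top level, at least one .pbtxt config inside it, and numeric version subdirectories for the model files. The following hypothetical sketch builds such a skeleton and parses it; the repository path and model name are placeholders.

# Hypothetical example of a repository layout analyse_config can parse:
#   example_model_repository/
#     yolov5/
#       config.pbtxt      <- any *.pbtxt counts as a config file
#       1/                <- numeric directory = model version (model.onnx, model.pdmodel, ... go here)
import os

repo = './example_model_repository'
os.makedirs(os.path.join(repo, 'yolov5', '1'), exist_ok=True)
with open(os.path.join(repo, 'yolov5', 'config.pbtxt'), 'w') as f:
    f.write('name: "yolov5"\nbackend: "fastdeploy"\nmax_batch_size: 1\n')

all_model_configs, all_model_versions = analyse_config(repo)
print(list(all_model_configs.keys()), all_model_versions)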
def exchange_format_to_original_format(exchange_format):
'''
Change config exchange format to original format.
'''
ensembles = []
models = []
all_models = {}
if 'ensembles' in exchange_format:
ensembles = exchange_format['ensembles']
if 'models' in exchange_format:
models = exchange_format['models']
alls = ensembles + models
for model_config in alls:
# 1. add 'executionAccelerators' keyword
if 'optimization' in model_config:
optimization_config = model_config['optimization']
del model_config['optimization']
model_config['optimization'] = {}
model_config['optimization'][
'executionAccelerators'] = optimization_config
# 2. delete versions information
if 'versions' in model_config:
del model_config['versions']
if 'config_filenames' in model_config:
del model_config['config_filenames']
if 'platform' in model_config and model_config[
'platform'] == 'ensemble':  # ensemble model
# 3. add 'ensembleScheduling' keyword
if 'step' in model_config:
step_configs = model_config['step']
if 'ensembleScheduling' not in model_config:
model_config['ensembleScheduling'] = {}
model_config['ensembleScheduling']['step'] = step_configs
del model_config['step']
# 4. remove two virtual models(feed, fetch), and
# "modelType", "inputModels", "outputModels", "inputVars", "outputVars"
remove_list = []
for model_config_in_step in step_configs:
if model_config_in_step[
'modelName'] == 'feed' or model_config_in_step[
'modelName'] == 'fetch':
remove_list.append(model_config_in_step)
continue
del model_config_in_step['modelType']
del model_config_in_step['inputModels']
del model_config_in_step['outputModels']
del model_config_in_step['inputVars']
del model_config_in_step['outputVars']
for remove_item in remove_list:
step_configs.remove(remove_item)
all_models[model_config['name']] = model_config
return all_models
def copy_config_file_to_default_config(model_dir, config_name):
json_config = json.loads(
pbtxt2json(open(os.path.join(model_dir, config_name)).read()))
model_name = os.path.basename(model_dir)
json_config['name'] = model_name
text_proto = json2pbtxt(json.dumps(json_config))
with open(os.path.join(model_dir, 'config.pbtxt'), 'w') as f:
f.write(text_proto)
def original_format_to_exchange_format(original_format, version_info):
'''
Change config original format to exchange format.
'''
exchange_format = {}
exchange_format['ensembles'] = []
exchange_format['models'] = []
# 0. transform version info into component format in frontend
for model_name, version_filenames_dict in version_info.items():
version_info_for_frontend = []
for version_name, filenames in version_filenames_dict.items():
version_filenames_dict_for_frontend = {}
version_filenames_dict_for_frontend['title'] = version_name
version_filenames_dict_for_frontend['key'] = version_name
version_filenames_dict_for_frontend['children'] = []
for filename in filenames:
version_filenames_dict_for_frontend['children'].append({
'title':
filename,
'key':
filename
})
version_info_for_frontend.append(
version_filenames_dict_for_frontend)
version_info[model_name] = version_info_for_frontend
for model_name, model_config in original_format.items():
# 1. remove 'executionAccelerators' keyword
transformed_config = copy.deepcopy(model_config)
if 'optimization' in model_config:
if 'executionAccelerators' in model_config['optimization']:
transformed_optimization_config = model_config['optimization'][
'executionAccelerators']
del transformed_config['optimization']
transformed_config[
'optimization'] = transformed_optimization_config
# 2. add versions information
if model_name in version_info:
transformed_config['versions'] = version_info[model_name]
if 'platform' in model_config and model_config[
'platform'] == 'ensemble':  # ensemble model
# 3. remove ensembleScheduling
if 'ensembleScheduling' in model_config:
if 'step' in model_config['ensembleScheduling']:
del transformed_config['ensembleScheduling']
transformed_config['step'] = model_config[
'ensembleScheduling']['step']
# 4. add two virtual models(feed, fetch), and
# "modelType", "inputModels", "outputModels", "inputVars", "outputVars"
for model_config_in_step in transformed_config['step']:
model_config_in_step['modelType'] = 'normal'
model_config_in_step['inputModels'] = []
model_config_in_step['outputModels'] = []
model_config_in_step['inputVars'] = []
model_config_in_step['outputVars'] = []
transformed_config['step'].append({
"modelName": "feed",
"modelType": "virtual",
"inputModels": [],
"outputModels": [],
"inputVars": [],
"outputVars": []
})
transformed_config['step'].append({
"modelName": "fetch",
"modelType": "virtual",
"inputModels": [],
"outputModels": [],
"inputVars": [],
"outputVars": []
})
analyse_step_relationships(transformed_config['step'],
transformed_config['input'],
transformed_config['output'])
exchange_format['ensembles'].append(transformed_config)
elif 'backend' in model_config: # single model
exchange_format['models'].append(transformed_config)
return exchange_format
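To illustrate the exchange format, here is a hypothetical single (non-ensemble) model config in the original format and its converted form; the field values are placeholders.

# Hypothetical example: convert one single-model config to the frontend exchange format.
original = {
    'yolov5': {
        'name': 'yolov5',
        'backend': 'fastdeploy',
        'maxBatchSize': 1,
        'config_filenames': 'config.pbtxt',
    }
}
versions = {'yolov5': {'1': ['model.onnx']}}
exchange = original_format_to_exchange_format(original, versions)
print(exchange['models'][0]['versions'])
# [{'title': '1', 'key': '1', 'children': [{'title': 'model.onnx', 'key': 'model.onnx'}]}]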
def analyse_step_relationships(step_config, inputs, outputs): # noqa: C901
'''
Analyse model relationships in ensemble step. And fill \
"inputModels", "outputModels", "inputVars", "outputVars" in step_config.
step_config: step data in ensemble model config.
inputs: inputs in ensemble model config.
outputs: outputs in ensemble model config.
'''
models_dict = {}
vars_dict = {}
for model_config_in_step in step_config:
models_dict[model_config_in_step['modelName']] = model_config_in_step
if model_config_in_step['modelType'] == 'virtual':
for var in inputs:
if var['name'] not in vars_dict:
vars_dict[var['name']] = {}
vars_dict[var['name']]['from_models'] = set()
vars_dict[var['name']]['to_models'] = set()
vars_dict[var['name']]['from_models'].add('feed')
for var in outputs:
if var['name'] not in vars_dict:
vars_dict[var['name']] = {}
vars_dict[var['name']]['from_models'] = set()
vars_dict[var['name']]['to_models'] = set()
vars_dict[var['name']]['to_models'].add('fetch')
else:
for var_placehold_name, var_name in model_config_in_step[
'inputMap'].items():
if var_name not in vars_dict:
vars_dict[var_name] = {}
vars_dict[var_name]['from_models'] = set()
vars_dict[var_name]['to_models'] = set()
vars_dict[var_name]['to_models'].add(
model_config_in_step['modelName'])
for var_placehold_name, var_name in model_config_in_step[
'outputMap'].items():
if var_name not in vars_dict:
vars_dict[var_name] = {}
vars_dict[var_name]['from_models'] = set()
vars_dict[var_name]['to_models'] = set()
vars_dict[var_name]['from_models'].add(
model_config_in_step['modelName'])
for var_name, relationships in vars_dict.items():
for from_model in relationships['from_models']:
models_dict[from_model]['outputVars'].append(var_name)
for var_to_model in relationships['to_models']:
if var_to_model not in models_dict[from_model]['outputModels']:
models_dict[from_model]['outputModels'].append(
var_to_model)
for to_model in relationships['to_models']:
models_dict[to_model]['inputVars'].append(var_name)
for var_from_model in relationships['from_models']:
if var_from_model not in models_dict[to_model]['inputModels']:
models_dict[to_model]['inputModels'].append(var_from_model)
calculate_layout_for_frontend(models_dict)
def get_config_filenames_for_one_model(cur_dir, name):
_, _, filenames = os.walk(os.path.join(cur_dir, name)).send(None)
config_filenames = []
backup_config_filenames = []
for filename in filenames:
if '.pbtxt' in filename and 'vdlbackup' not in filename:
config_filenames.append(
filename
)  # filenames with extension .pbtxt that do not contain 'vdlbackup' are normal config files
elif '.pbtxt' in filename and 'vdlbackup' in filename:
backup_config_filenames.append(
filename
) # filenames with extension .pbtxt and contain 'vdlbackup' are backup config files
config_filenames = sorted(config_filenames) + sorted(
backup_config_filenames)
return config_filenames
def get_config_for_one_model(cur_dir, name, config_filename):
all_model_configs = {}
all_model_versions = {}
filename = os.path.join(cur_dir, name, config_filename)
json_config = json.loads(pbtxt2json(open(filename).read()))
json_config[
'name'] = name  # the name in the config data may differ from the model name
# (which is actually the model directory name), so we make name conform to it.
json_config["config_filenames"] = config_filename
all_model_configs[
name] = json_config # store original config file content in json format
all_model_versions[name] = {}
for model_sub_dir in os.listdir(os.path.join(cur_dir, name)):
if re.match(r'\d+',
model_sub_dir): # version directory consists of numbers
if model_sub_dir not in all_model_versions[name]:
all_model_versions[name][model_sub_dir] = []
for version_resource_file in os.listdir(
os.path.join(cur_dir, name, model_sub_dir)):
all_model_versions[name][model_sub_dir].append(
version_resource_file)
model_config = original_format_to_exchange_format(all_model_configs,
all_model_versions)
if model_config['ensembles']:
return model_config['ensembles'][0]
elif model_config['models']:
return model_config['models'][0]
def calculate_layout_for_frontend(model_config_in_step):
'''
Analyse model topology connections and prepare the positions for each model in layout.
Dynamic programming recurrence:
depth(cur_node) = max([depth(prev_node) for prev_node in cur_node['inputModels']]) + 1
Args:
model_config_in_step(dict): model config in ensemble models' step, indexed by model name.
Returns:
None. Results calculated will be saved in place.
'''
path_depth = defaultdict(int)
def depth_recursive(model):
if model['modelName'] == 'feed':
path_depth[model['modelName']] = 0
return 0
if path_depth[model['modelName']] != 0:
return path_depth[model['modelName']]
path_depth[model['modelName']] = max([
depth_recursive(model_config_in_step[model_name]) for model_name in
model_config_in_step[model['modelName']]['inputModels']
]) + 1
return path_depth[model['modelName']]
depth_recursive(model_config_in_step['fetch'])
path_depth_tuple = [
(k, v)
for k, v in sorted(path_depth.items(), key=lambda item: item[1])
]
cur_x = 0
last_depth = -1
for model_name, depth in path_depth_tuple:
if depth == last_depth:
model_config_in_step[model_name]['pos_y'] = depth
model_config_in_step[model_name]['pos_x'] = cur_x
cur_x += 1
else:
cur_x = 0
model_config_in_step[model_name]['pos_y'] = depth
model_config_in_step[model_name]['pos_x'] = cur_x
cur_x += 1
last_depth = depth
return
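A worked example of the depth recurrence on a tiny linear ensemble step (feed -> preprocess -> infer -> fetch); the model names are hypothetical and only the fields the function reads are filled in.

# Hypothetical example: run the layout pass on a minimal step graph.
step = {
    'feed': {'modelName': 'feed', 'inputModels': []},
    'preprocess': {'modelName': 'preprocess', 'inputModels': ['feed']},
    'infer': {'modelName': 'infer', 'inputModels': ['preprocess']},
    'fetch': {'modelName': 'fetch', 'inputModels': ['infer']},
}
calculate_layout_for_frontend(step)
print({name: (cfg['pos_x'], cfg['pos_y']) for name, cfg in step.items()})
# feed lands at depth 0, preprocess at 1, infer at 2, fetch at 3;
# pos_x would separate models that share the same depth.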
def launch_process(kwargs: dict):
'''
Launch a fastdeploy server according to specified arguments.
'''
cmd = ['fastdeployserver']
launch_env = os.environ.copy()
start_args = {}
for key, value in kwargs.items():
if key == 'default_model_name': # Used to fill client model_name automatically
start_args[key] = value
continue
if key == 'server-name' or key == 'ensemble-img': # extra information
start_args[key] = value
continue
if key == 'gpus':
if value:
launch_env['CUDA_VISIBLE_DEVICES'] = value
start_args[key] = value
continue
cmd.append('--{}'.format(key))
cmd.append('{}'.format(value))
start_args[key] = value
if start_args['server-name'] and start_args['server-name'] in os.listdir(
FASTDEPLOYSERVER_PATH):
raise RuntimeError(
"Failed to launch server,server name {} has been used,please write a different server name."
.format(start_args['server-name']))
all_model_configs, all_model_versions = analyse_config(
start_args['model-repository'])
model_repo_config = original_format_to_exchange_format(
all_model_configs, all_model_versions)
model_repo_config['ensemble-img'] = start_args['ensemble-img']
logfilename = 'logfile-{}'.format(get_random_string(8))
while os.path.exists(os.path.join(FASTDEPLOYSERVER_PATH, logfilename)):
logfilename = 'logfile-{}'.format(get_random_string(8))
p = Popen(
cmd,
stdout=open(
os.path.join(FASTDEPLOYSERVER_PATH, logfilename), 'w',
buffering=1),
stderr=STDOUT,
universal_newlines=True,
env=launch_env)
server_name = start_args['server-name'] if start_args[
'server-name'] else p.pid
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_name)),
'w') as f:
# the file named ${server_name} contains 4 lines:
# line1 : the real log filename ${logfilename}
# line2 : pid
# line3 : launch arguments
# line4 : model-repository configuration
f.write(logfilename + '\n' + str(p.pid) + '\n' +
json.dumps(start_args) + '\n' + json.dumps(model_repo_config))
return p
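A hedged sketch of the kwargs dict launch_process expects, inferred from how the keys are consumed above. The values are placeholders; fastdeployserver must be installed and the model repository (e.g. the skeleton from the earlier sketch) must exist for the spawned command to succeed.

# Hypothetical launch arguments. 'server-name', 'ensemble-img' and 'default_model_name' are
# bookkeeping entries recorded in the server file; 'gpus' is exported as CUDA_VISIBLE_DEVICES;
# the remaining keys become `--key value` command line flags for fastdeployserver.
launch_kwargs = {
    'server-name': 'my_server',
    'default_model_name': 'yolov5',
    'ensemble-img': '',
    'gpus': '0',
    'model-repository': './example_model_repository',
    'http-port': '8000',
    'grpc-port': '8001',
    'metrics-port': '8002',
}
process = launch_process(launch_kwargs)
print(process.pid, process.poll())  # poll() is None while the server process is still running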
def get_random_string(length):
# choose from all lowercase letter
letters = string.ascii_lowercase
result_str = ''.join([random.choice(letters) for i in range(length)])
return result_str
def get_start_arguments(server_id):
'''
Get the start arguments for fastdeployserver process.
Args:
server_id(str): fastdeployserver process name
Returns:
args(dict): launch arguments when start fastdeployserver process.
'''
args = {}
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'r') as f:
arguments_json = f.read().split('\n')[2]
args = json.loads(arguments_json)
return args
def get_process_pid(server_id):
'''
Get the process id for fastdeployserver process.
Args:
server_id(str): fastdeployserver process name
Returns:
pid(int): process id.
'''
pid = None
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'r') as f:
pid = int(f.read().split('\n')[1])
return pid
def get_process_logfile_name(server_id):
'''
Get the process logfile name for fastdeployserver process.
Args:
server_id(str): fastdeployserver process name
Returns:
logfile(str): logfile name.
'''
filename = None
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'r') as f:
filename = f.read().split('\n')[0]
return filename
def get_process_model_configuration(server_id):
'''
Get the model repository configuration for fastdeployserver process.
Args:
server_id(str): fastdeployserver process name
Returns:
configuration(dict): model repository configuration
'''
conf = {}
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'r') as f:
conf_json = f.read().split('\n')[3]
conf = json.loads(conf_json)
return conf
def get_process_output(server_id, length):
'''
Get the standard output of an opened subprocess.
'''
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
logfilename = get_process_logfile_name(server_id)
# read the log file ${logfilename} if it exists
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH,
'{}'.format(logfilename)), 'r') as f:
f.seek(length)
data = f.read()
return data
def mark_pid_for_dead_process(server_id):
'''
Resource files for a dead server are only deleted when the user closes the server in the frontend.
When the user closes the server, the pid recorded in the logfile will be killed.
In case a dead process id gets reassigned to a new process, we mark the pid recorded in the logfile as outdated.
Here, we replace the pid with -1 in the logfile to denote a zombie process \
that has already been polled and is dead.
Args:
server_id(str): fastdeployserver process name
'''
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'r') as f:
contents = f.read().split('\n')
contents[1] = '-1' # we replace pid to -1
with open(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)),
'w') as f:
f.write('\n'.join(contents))
def delete_files_for_process(server_id):
'''
Delete logfile for fastdeployserver process.
Args:
server_id(str): fastdeployserver process name
'''
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))):
logfilename = get_process_logfile_name(server_id)
# delete file ${logfilename} if exists
if os.path.exists(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename))):
os.remove(
os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename)))
os.remove(os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)))
def kill_process(process):
'''
Stop an opened subprocess.
'''
if type(process) == str: # server_id, use os.kill to terminate
pid = get_process_pid(process)
if pid == -1: # we use -1 to mark dead process
return
try:
os.kill(pid, signal.SIGKILL)
except Exception:
pass
else:
pid = process.pid
process.kill()
try:
process.wait(10)
except Exception:
pass
def get_alive_fastdeploy_servers():
'''
Search server names in `FASTDEPLOYSERVER_PATH`. If a process is dead but its log still exists for \
some unexpected reason, delete the log file.
'''
server_names = [
name for name in os.listdir(FASTDEPLOYSERVER_PATH)
if 'logfile' not in name
]
should_delete_servers = []
for server_name in server_names:
if check_process_alive(server_name) is False:
delete_files_for_process(server_name)
should_delete_servers.append(server_name)
for server_name in should_delete_servers:
server_names.remove(server_name)
return server_names
def check_process_zombie(server_id):
'''
Given a server id, check whether the process has become a zombie (its pid has been marked as -1).
Args:
server_id(str): fastdeployserver process name
Return:
status(bool): True if the process has become a zombie.
'''
pid = get_process_pid(server_id)
if pid == -1:
return True
else:
return False
def check_process_alive(server_id):
'''
Given a server id, check whether the process is alive or not.
Args:
server_id(str): fastdeployserver process name
Return:
status(bool): True if process is still alive.
'''
pid = get_process_pid(server_id)
if pid is None:
return False
if pid == -1:  # We use -1 to mark a zombie process that has already died.
# Since the user may want to know why the process died (e.g. due to an exception),
# we return True so that the frontend can still fetch the log of the dead process.
return True
try:
os.kill(pid, 0)
except OSError:
return False
else:
if 'fastdeployserve' not in psutil.Process(pid).name(
):  # Check that the pid still belongs to a fastdeployserver process, in case the pid has been reassigned.
# Note: on Linux the kernel truncates process names to 15 characters (TASK_COMM_LEN),
# which is why psutil.Process(pid).name() reports 'fastdeployserve' rather than 'fastdeployserver'.
return False
else:
return True
_metric_column_name = {
"Model": {
"nv_inference_request_success", "nv_inference_request_failure",
"nv_inference_count", "nv_inference_exec_count",
"nv_inference_request_duration_us", "nv_inference_queue_duration_us",
"nv_inference_compute_input_duration_us",
"nv_inference_compute_infer_duration_us",
"nv_inference_compute_output_duration_us"
},
"GPU": {
"nv_gpu_power_usage", "nv_gpu_power_limit", "nv_energy_consumption",
"nv_gpu_utilization", "nv_gpu_memory_total_bytes",
"nv_gpu_memory_used_bytes"
},
"CPU": {
"nv_cpu_utilization", "nv_cpu_memory_total_bytes",
"nv_cpu_memory_used_bytes"
}
}
def generate_metric_table(server_addr, server_port): # noqa:C901
model_table = {}
gpu_table = {}
try:
res = requests.get("http://{}:{}/metrics".format(
server_addr, server_port))
except Exception:
return None
metric_content = res.text
for content in metric_content.split('\n'):
if content.startswith('#'):
continue
else:
res = re.match(r'(\w+){(.*)} (\w+)',
content) # match output by server metrics interface
if not res:
continue
metric_name = res.group(1)
model = res.group(2)
value = res.group(3)
infos = {}
for info in model.split(','):
k, v = info.split('=')
v = v.strip('"')
infos[k] = v
if metric_name in [
"nv_inference_request_duration_us",
"nv_inference_queue_duration_us",
"nv_inference_compute_input_duration_us",
"nv_inference_compute_infer_duration_us",
"nv_inference_compute_output_duration_us"
]:
value = float(value) / 1000
elif metric_name in [
"nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
]:
value = float(value) / 1024 / 1024 / 1024
for key, metric_names in _metric_column_name.items():
if metric_name in metric_names:
if key == 'Model':
model_name = infos['model']
if model_name not in model_table:
model_table[model_name] = {}
model_table[model_name][metric_name] = value
elif key == 'GPU':
gpu_name = infos['gpu_uuid']
if gpu_name not in gpu_table:
gpu_table[gpu_name] = {}
gpu_table[gpu_name][metric_name] = value
elif key == 'CPU':
pass
results = {}
results['Model'] = model_table
results['GPU'] = gpu_table
return results
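Unlike get_metric_data in the client, this helper returns raw dictionaries for the backend API. A brief sketch of how it can be consumed; the address and port are placeholders, and it returns None if the metrics endpoint is unreachable.

# Hypothetical example: read the metrics of a running fastdeployserver.
table = generate_metric_table('localhost', 8002)
if table is not None:
    for model_name, metrics in table['Model'].items():
        print(model_name, metrics.get('nv_inference_request_success'))
    for gpu_uuid, metrics in table['GPU'].items():
        print(gpu_uuid, metrics.get('nv_gpu_utilization'))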
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import datetime
import json
import os
import re
import shutil
import socket
import time
from multiprocessing import Process
from pathlib import Path
import requests
from .fastdeploy_client.client_app import create_gradio_client_app
from .fastdeploy_lib import analyse_config
from .fastdeploy_lib import check_process_zombie
from .fastdeploy_lib import copy_config_file_to_default_config
from .fastdeploy_lib import delete_files_for_process
from .fastdeploy_lib import exchange_format_to_original_format
from .fastdeploy_lib import generate_metric_table
from .fastdeploy_lib import get_alive_fastdeploy_servers
from .fastdeploy_lib import get_config_filenames_for_one_model
from .fastdeploy_lib import get_config_for_one_model
from .fastdeploy_lib import get_process_model_configuration
from .fastdeploy_lib import get_process_output
from .fastdeploy_lib import get_start_arguments
from .fastdeploy_lib import json2pbtxt
from .fastdeploy_lib import kill_process
from .fastdeploy_lib import launch_process
from .fastdeploy_lib import mark_pid_for_dead_process
from .fastdeploy_lib import original_format_to_exchange_format
from .fastdeploy_lib import validate_data
from visualdl.server.api import gen_result
from visualdl.server.api import result
from visualdl.utils.dir import FASTDEPLOYSERVER_PATH
class FastDeployServerApi(object):
def __init__(self):
self.root_dir = Path(os.getcwd())
self.opened_servers = {
}  # Used to store opened fastdeployserver processes, keyed by server name
self.client_port = None
@result()
def get_directory(self, cur_dir):
if self.root_dir not in Path(os.path.abspath(cur_dir)).parents:
cur_dir = '.'
cur_dir, sub_dirs, filenames = os.walk(cur_dir).send(None)
if Path(self.root_dir) != Path(os.path.abspath(cur_dir)):
sub_dirs.append('..')
sub_dirs = sorted(sub_dirs)
directorys = {
'parent_dir':
os.path.relpath(Path(os.path.abspath(cur_dir)), self.root_dir),
'sub_dir':
sub_dirs
}
return directorys
@result()
def get_config(self, cur_dir):
all_model_configs, all_model_versions = analyse_config(cur_dir)
return original_format_to_exchange_format(all_model_configs,
all_model_versions)
@result()
def config_update(self, cur_dir, model_name, config, config_filename):
config = json.loads(config)
all_models = exchange_format_to_original_format(config)
model_dir = os.path.join(os.path.abspath(cur_dir), model_name)
filtered_config = validate_data(all_models[model_name])
text_proto = json2pbtxt(json.dumps(filtered_config))
# back up the user's config data first, so that if the data is corrupted by the front end we can still recover it
# backup config filename: {original_name}_vdlbackup_{datetime}.pbtxt
# a backup config can only be used to restore config.pbtxt
if 'vdlbackup' in config_filename:
raise RuntimeError(
"Backup config file is not permitted to update.")
basename = os.path.splitext(config_filename)[0]
shutil.copy(
os.path.join(model_dir, config_filename),
os.path.join(
model_dir, '{}_vdlbackup_{}.pbtxt'.format(
basename,
datetime.datetime.now().isoformat())))
with open(os.path.join(model_dir, config_filename), 'w') as f:
f.write(text_proto)
return
@result()
def start_server(self, configs):
configs = json.loads(configs)
process = launch_process(configs)
if process.poll() is not None:
raise RuntimeError(
"Failed to launch fastdeployserver,please check fastdeployserver is installed in environment."
)
server_name = configs['server-name'] if configs[
'server-name'] else str(process.pid)
self.opened_servers[server_name] = process
return server_name
@result()
def stop_server(self, server_id):
if server_id in self.opened_servers: # check if server_id in self.opened_servers
kill_process(self.opened_servers[server_id])
del self.opened_servers[server_id]
elif server_id in set(
os.listdir(FASTDEPLOYSERVER_PATH)): # check if server_id in
# FASTDEPLOYSERVER_PATH(may be launched by other vdl app instance by gunicorn)
kill_process(server_id)
delete_files_for_process(server_id)
self._poll_zombie_process()
@result('text/plain')
def get_server_output(self, server_id, length):
length = int(length)
if server_id in self.opened_servers: # check if server_id in self.opened_servers
return get_process_output(server_id, length)
        elif str(server_id) in set(
                os.listdir(FASTDEPLOYSERVER_PATH)):  # check if server_id is in
            # FASTDEPLOYSERVER_PATH (it may have been launched by another vdl app instance under gunicorn)
return get_process_output(server_id, length)
else:
return
@result()
def get_server_metric(self, server_id):
args = get_start_arguments(server_id)
host = 'localhost'
port = args.get('metrics-port', 8002)
return generate_metric_table(host, port)
@result()
def get_server_list(self):
return get_alive_fastdeploy_servers()
@result()
def check_server_alive(self, server_id):
self._poll_zombie_process()
if check_process_zombie(server_id) is True:
            raise RuntimeError(
                "Server {} is down because it exited abnormally or was killed, please check the reason in the log, \
                then close this server.".format(server_id))
return
@result()
def get_server_config(self, server_id):
return get_process_model_configuration(server_id)
@result()
def get_pretrain_model_list(self):
'''
Get all available fastdeploy models from hub server.
'''
res = requests.get(
'http://paddlepaddle.org.cn/paddlehub/fastdeploy_listmodels')
result = res.json()
if result['status'] != 0:
raise RuntimeError(
"Failed to get pre-trained model list from hub server.")
else:
data = result['data']
model_list = {}
for category, models in data.items():
if category not in model_list:
model_list[category] = set()
for model in models:
model_list[category].add(model['name'])
# adapt data format for frontend
models_info = []
for category, model_names in model_list.items():
models_info.append({
"value": category,
"label": category,
"children": []
})
for model_name in sorted(model_names):
models_info[-1]["children"].append({
"value": model_name,
"label": model_name
})
return models_info
@result()
def download_pretrain_model(self, cur_dir, model_name, version,
pretrain_model_name):
version_resource_dir = os.path.join(
os.path.abspath(cur_dir), model_name, version)
try:
import fastdeploy as fd
except Exception:
            raise RuntimeError(
                "fastdeploy is required for downloading pretrained models, please refer to \
                https://github.com/PaddlePaddle/FastDeploy to install fastdeploy")
model_path = fd.download_model(
name=pretrain_model_name, path=version_resource_dir)
if model_path:
if '.onnx' in model_path:
shutil.move(
model_path,
os.path.join(os.path.dirname(model_path), 'model.onnx'))
else:
for filename in os.listdir(model_path):
if '.pdmodel' in filename or '.pdiparams' in filename:
shutil.move(
os.path.join(model_path, filename),
os.path.join(
os.path.dirname(model_path), 'model{}'.format(
os.path.splitext(filename)[1])))
else:
shutil.move(
os.path.join(model_path, filename),
os.path.join(
os.path.dirname(model_path), filename))
shutil.rmtree(model_path)
version_info_for_frontend = []
for version_name in os.listdir(os.path.join(cur_dir, model_name)):
if re.match(
r'\d+',
version_name): # version directory consists of numbers
version_filenames_dict_for_frontend = {}
version_filenames_dict_for_frontend['title'] = version_name
version_filenames_dict_for_frontend['key'] = version_name
version_filenames_dict_for_frontend['children'] = []
for filename in os.listdir(
os.path.join(cur_dir, model_name, version_name)):
version_filenames_dict_for_frontend['children'].append(
{
'title': filename,
'key': filename
})
version_info_for_frontend.append(
version_filenames_dict_for_frontend)
return version_info_for_frontend
else:
raise RuntimeError(
"Failed to download pre-trained model {}.".format(
pretrain_model_name))
@result()
def get_config_for_model(self, cur_dir, name, config_filename):
return get_config_for_one_model(cur_dir, name, config_filename)
@result()
def get_config_filenames_for_model(self, cur_dir, name):
return get_config_filenames_for_one_model(cur_dir, name)
@result()
def delete_config_for_model(self, cur_dir, name, config_filename):
if self.root_dir not in Path(
os.path.abspath(cur_dir)
        ).parents:  # should prevent the user from removing files outside the model repository
raise RuntimeError(
'Failed to delete config file, please check filepath.')
if os.path.exists(os.path.join(cur_dir, name, config_filename)):
os.remove(os.path.join(cur_dir, name, config_filename))
return get_config_filenames_for_one_model(cur_dir, name)
@result()
def set_default_config_for_model(self, cur_dir, name, config_filename):
model_dir = os.path.join(os.path.abspath(cur_dir), name)
# backup config.pbtxt to config_vdlbackup_{datetime}.pbtxt
if os.path.exists(os.path.join(model_dir, 'config.pbtxt')):
shutil.copy(
os.path.join(model_dir, 'config.pbtxt'),
os.path.join(
model_dir, 'config_vdlbackup_{}.pbtxt'.format(
datetime.datetime.now().isoformat())))
if config_filename != 'config.pbtxt':
copy_config_file_to_default_config(model_dir, config_filename)
return
@result()
def delete_resource_for_model(self, cur_dir, model_name, version,
resource_filename):
if self.root_dir not in Path(
os.path.abspath(cur_dir)
        ).parents:  # should prevent the user from removing files outside the model repository
raise RuntimeError(
'Failed to delete resource file, please check filepath.')
resource_path = os.path.join(
os.path.abspath(cur_dir), model_name, version, resource_filename)
if os.path.exists(resource_path):
os.remove(resource_path)
version_info_for_frontend = []
for version_name in os.listdir(os.path.join(cur_dir, model_name)):
if re.match(r'\d+',
version_name): # version directory consists of numbers
version_filenames_dict_for_frontend = {}
version_filenames_dict_for_frontend['title'] = version_name
version_filenames_dict_for_frontend['key'] = version_name
version_filenames_dict_for_frontend['children'] = []
for filename in os.listdir(
os.path.join(cur_dir, model_name, version_name)):
version_filenames_dict_for_frontend['children'].append({
'title':
filename,
'key':
filename
})
version_info_for_frontend.append(
version_filenames_dict_for_frontend)
return version_info_for_frontend
@result()
def rename_resource_for_model(self, cur_dir, model_name, version,
resource_filename, new_filename):
if self.root_dir not in Path(
os.path.abspath(cur_dir)
        ).parents:  # should prevent the user from renaming files outside the model repository
raise RuntimeError(
'Failed to rename resource file, please check filepath.')
resource_path = os.path.join(
os.path.abspath(cur_dir), model_name, version, resource_filename)
new_file_path = os.path.join(
os.path.abspath(cur_dir), model_name, version, new_filename)
if os.path.exists(resource_path):
shutil.move(resource_path, new_file_path)
version_info_for_frontend = []
for version_name in os.listdir(os.path.join(cur_dir, model_name)):
if re.match(r'\d+',
version_name): # version directory consists of numbers
version_filenames_dict_for_frontend = {}
version_filenames_dict_for_frontend['title'] = version_name
version_filenames_dict_for_frontend['key'] = version_name
version_filenames_dict_for_frontend['children'] = []
for filename in os.listdir(
os.path.join(cur_dir, model_name, version_name)):
version_filenames_dict_for_frontend['children'].append({
'title':
filename,
'key':
filename
})
version_info_for_frontend.append(
version_filenames_dict_for_frontend)
return version_info_for_frontend
def create_fastdeploy_client(self):
if self.client_port is None:
def get_free_tcp_port():
tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
tcp.bind(('localhost', 0))
addr, port = tcp.getsockname()
tcp.close()
return port
self.client_port = get_free_tcp_port()
app = create_gradio_client_app()
thread = Process(
target=app.launch, kwargs={'server_port': self.client_port})
thread.start()
            def check_alive():
                # block until the gradio client app starts responding
                while True:
try:
requests.get('http://localhost:{}/'.format(
self.client_port))
break
except Exception:
time.sleep(1)
check_alive()
return self.client_port
def _poll_zombie_process(self):
        # check whether any server killed by another vdl app instance has become a zombie
should_delete = []
for server_id, process in self.opened_servers.items():
if process.poll() is not None:
mark_pid_for_dead_process(server_id)
should_delete.append(server_id)
for server_id in should_delete:
del self.opened_servers[server_id]
def create_fastdeploy_api_call():
api = FastDeployServerApi()
routes = {
'get_directory': (api.get_directory, ['dir']),
'config_update': (api.config_update,
['dir', 'name', 'config', 'config_filename']),
'get_config': (api.get_config, ['dir']),
'get_config_filenames_for_model': (api.get_config_filenames_for_model,
['dir', 'name']),
'get_config_for_model': (api.get_config_for_model,
['dir', 'name', 'config_filename']),
'set_default_config_for_model': (api.set_default_config_for_model,
['dir', 'name', 'config_filename']),
'delete_config_for_model': (api.delete_config_for_model,
['dir', 'name', 'config_filename']),
'start_server': (api.start_server, ['config']),
'stop_server': (api.stop_server, ['server_id']),
'get_server_output': (api.get_server_output, ['server_id', 'length']),
'create_fastdeploy_client': (api.create_fastdeploy_client, []),
'get_server_list': (api.get_server_list, []),
'get_server_metric': (api.get_server_metric, ['server_id']),
'get_server_config': (api.get_server_config, ['server_id']),
'get_pretrain_model_list': (api.get_pretrain_model_list, []),
'check_server_alive': (api.check_server_alive, ['server_id']),
'download_pretrain_model':
(api.download_pretrain_model,
['dir', 'name', 'version', 'pretrain_model_name']),
'delete_resource_for_model':
(api.delete_resource_for_model,
['dir', 'name', 'version', 'resource_filename']),
'rename_resource_for_model': (api.rename_resource_for_model, [
'dir', 'name', 'version', 'resource_filename', 'new_filename'
])
}
def call(path: str, args):
route = routes.get(path)
if not route:
return json.dumps(gen_result(
status=1, msg='api not found')), 'application/json', None
method, call_arg_names = route
call_args = [args.get(name) for name in call_arg_names]
return method(*call_args)
return call
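# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): the callable
# returned by create_fastdeploy_api_call() looks up the requested route in the
# table above, pulls the named arguments out of the request dict and forwards
# them to FastDeployServerApi. The argument values below are hypothetical.
#
#   api_call = create_fastdeploy_api_call()
#   # known route: 'get_directory' takes a single 'dir' argument
#   response = api_call('get_directory', {'dir': '.'})
#   # unknown route: a JSON error built by gen_result(status=1, msg='api not found')
#   response = api_call('no_such_api', {})
# ---------------------------------------------------------------------------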
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018, TensorFlow Authors. All rights reserved.
syntax = "proto3";
package inference;
//@@.. cpp:namespace:: inference
//@@
//@@.. cpp:enum:: DataType
//@@
//@@ Data types supported for input and output tensors.
//@@
enum DataType {
//@@ .. cpp:enumerator:: DataType::INVALID = 0
TYPE_INVALID = 0;
//@@ .. cpp:enumerator:: DataType::BOOL = 1
TYPE_BOOL = 1;
//@@ .. cpp:enumerator:: DataType::UINT8 = 2
TYPE_UINT8 = 2;
//@@ .. cpp:enumerator:: DataType::UINT16 = 3
TYPE_UINT16 = 3;
//@@ .. cpp:enumerator:: DataType::UINT32 = 4
TYPE_UINT32 = 4;
//@@ .. cpp:enumerator:: DataType::UINT64 = 5
TYPE_UINT64 = 5;
//@@ .. cpp:enumerator:: DataType::INT8 = 6
TYPE_INT8 = 6;
//@@ .. cpp:enumerator:: DataType::INT16 = 7
TYPE_INT16 = 7;
//@@ .. cpp:enumerator:: DataType::INT32 = 8
TYPE_INT32 = 8;
//@@ .. cpp:enumerator:: DataType::INT64 = 9
TYPE_INT64 = 9;
//@@ .. cpp:enumerator:: DataType::FP16 = 10
TYPE_FP16 = 10;
//@@ .. cpp:enumerator:: DataType::FP32 = 11
TYPE_FP32 = 11;
//@@ .. cpp:enumerator:: DataType::FP64 = 12
TYPE_FP64 = 12;
//@@ .. cpp:enumerator:: DataType::STRING = 13
TYPE_STRING = 13;
//@@ .. cpp:enumerator:: DataType::BF16 = 14
TYPE_BF16 = 14;
}
//@@
//@@ .. cpp:var:: message ModelRateLimiter
//@@
//@@ The specifications required by the rate limiter to properly
//@@ schedule the inference requests across the different models
//@@ and their instances.
//@@
message ModelRateLimiter
{
//@@ .. cpp:var:: message Resource
//@@
//@@ The resource property.
//@@
message Resource
{
//@@ .. cpp:var:: string name
//@@
//@@ The name associated with the resource.
//@@
string name = 1;
//@@ .. cpp:var:: bool global
//@@
//@@ Whether or not the resource is global. If true then the resource
//@@ is assumed to be shared among the devices otherwise specified
//@@ count of the resource is assumed for each device associated
//@@ with the instance.
//@@
bool global = 2;
//@@ .. cpp:var:: uint32 count
//@@
//@@ The number of resources required for the execution of the model
//@@ instance.
//@@
uint32 count = 3;
}
//@@ .. cpp:var:: Resource resources (repeated)
//@@
//@@ The resources required to execute the request on a model instance.
//@@ Resources are just names with a corresponding count. The execution
  //@@  of the instance will be blocked until the specified resources are
//@@ available. By default an instance uses no rate-limiter resources.
//@@
repeated Resource resources = 1;
//@@ .. cpp:var:: uint32 priority
//@@
//@@ The optional weighting value to be used for prioritizing across
//@@ instances. An instance with priority 2 will be given 1/2 the
//@@ number of scheduling chances as an instance_group with priority
//@@ 1. The default priority is 1. The priority of value 0 will be
//@@ treated as priority 1.
//@@
uint32 priority = 2;
}
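// Illustrative sketch (not part of the upstream file): a ModelRateLimiter as it
// could appear in a config.pbtxt, attached to an instance group via the
// 'rate_limiter' field defined in ModelInstanceGroup below.
//
//   rate_limiter {
//     resources [ { name: "R1" count: 4 } ]
//     priority: 2
//   }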
//@@
//@@.. cpp:var:: message ModelInstanceGroup
//@@
//@@ A group of one or more instances of a model and resources made
//@@ available for those instances.
//@@
message ModelInstanceGroup
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ Kind of this instance group.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0
//@@
//@@ This instance group represents instances that can run on either
//@@ CPU or GPU. If all GPUs listed in 'gpus' are available then
//@@ instances will be created on GPU(s), otherwise instances will
//@@ be created on CPU.
//@@
KIND_AUTO = 0;
//@@ .. cpp:enumerator:: Kind::KIND_GPU = 1
//@@
//@@ This instance group represents instances that must run on the
//@@ GPU.
//@@
KIND_GPU = 1;
//@@ .. cpp:enumerator:: Kind::KIND_CPU = 2
//@@
//@@ This instance group represents instances that must run on the
//@@ CPU.
//@@
KIND_CPU = 2;
//@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3
//@@
//@@ This instance group represents instances that should run on the
//@@ CPU and/or GPU(s) as specified by the model or backend itself.
//@@ The inference server will not override the model/backend
//@@ settings.
//@@
KIND_MODEL = 3;
}
//@@
//@@ .. cpp:var:: message SecondaryDevice
//@@
//@@ A secondary device required for a model instance.
//@@
message SecondaryDevice
{
//@@
//@@ .. cpp:enum:: SecondaryDeviceKind
//@@
//@@ The kind of the secondary device.
//@@
enum SecondaryDeviceKind {
//@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0
//@@
//@@ An NVDLA core. http://nvdla.org
//@@ Currently KIND_NVDLA is only supported by the TensorRT backend.
//@@
KIND_NVDLA = 0;
}
//@@ .. cpp:var:: SecondaryDeviceKind kind
//@@
//@@ The secondary device kind.
//@@
SecondaryDeviceKind kind = 1;
//@@ .. cpp:var:: int64 device_id
//@@
//@@ Identifier for the secondary device.
//@@
int64 device_id = 2;
}
//@@ .. cpp:var:: string name
//@@
//@@ Optional name of this group of instances. If not specified the
//@@ name will be formed as <model name>_<group number>. The name of
//@@ individual instances will be further formed by a unique instance
//@@ number and GPU index:
//@@
string name = 1;
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this instance group. Default is KIND_AUTO. If
//@@ KIND_AUTO or KIND_GPU then both 'count' and 'gpu' are valid and
//@@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid
//@@ and 'gpu' cannot be specified.
//@@
Kind kind = 4;
//@@ .. cpp:var:: int32 count
//@@
//@@ For a group assigned to GPU, the number of instances created for
//@@ each GPU listed in 'gpus'. For a group assigned to CPU the number
//@@ of instances created. Default is 1.
int32 count = 2;
//@@ .. cpp:var:: ModelRateLimiter rate_limiter
//@@
//@@ The rate limiter specific settings to be associated with this
//@@ instance group. Optional, if not specified no rate limiting
//@@ will be applied to this instance group.
//@@
ModelRateLimiter rate_limiter = 6;
//@@ .. cpp:var:: int32 gpus (repeated)
//@@
//@@ GPU(s) where instances should be available. For each GPU listed,
//@@ 'count' instances of the model will be available. Setting 'gpus'
  //@@  to empty (or not specifying at all) is equivalent to listing all
//@@ available GPUs.
//@@
repeated int32 gpus = 3;
//@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated)
//@@
//@@ Secondary devices that are required by instances specified by this
//@@ instance group. Optional.
//@@
repeated SecondaryDevice secondary_devices = 8;
//@@ .. cpp:var:: string profile (repeated)
//@@
//@@ For TensorRT models containing multiple optimization profile, this
//@@ parameter specifies a set of optimization profiles available to this
//@@ instance group. The inference server will choose the optimal profile
//@@ based on the shapes of the input tensors. This field should lie
//@@ between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1
//@@ and be specified only for TensorRT backend, otherwise an error will
//@@ be generated. If not specified, the server will select the first
//@@ optimization profile by default.
//@@
repeated string profile = 5;
//@@ .. cpp:var:: bool passive
//@@
//@@ Whether the instances within this instance group will be accepting
//@@ inference requests from the scheduler. If true, the instances will
//@@ not be added to the scheduler. Default value is false.
//@@
bool passive = 7;
//@@ .. cpp:var:: string host_policy
//@@
//@@ The host policy name that the instance to be associated with.
//@@ The default value is set to reflect the device kind of the instance,
//@@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and
//@@ KIND_GPU is "gpu_<gpu_id>".
//@@
string host_policy = 9;
}
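// Illustrative sketch (not part of the upstream file): a ModelInstanceGroup as it
// could appear in a config.pbtxt, assuming the conventional 'instance_group'
// field of the top-level model configuration (not shown in this excerpt).
//
//   instance_group [
//     {
//       count: 2
//       kind: KIND_GPU
//       gpus: [ 0, 1 ]
//     }
//   ]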
//@@
//@@.. cpp:var:: message ModelTensorReshape
//@@
//@@ Reshape specification for input and output tensors.
//@@
message ModelTensorReshape
{
//@@ .. cpp:var:: int64 shape (repeated)
//@@
//@@ The shape to use for reshaping.
//@@
repeated int64 shape = 1;
}
//@@
//@@.. cpp:var:: message ModelInput
//@@
//@@ An input required by the model.
//@@
message ModelInput
{
//@@
//@@ .. cpp:enum:: Format
//@@
//@@ The format for the input.
//@@
enum Format {
//@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0
//@@
//@@ The input has no specific format. This is the default.
//@@
FORMAT_NONE = 0;
//@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1
//@@
//@@ HWC image format. Tensors with this format require 3 dimensions
//@@ if the model does not support batching (max_batch_size = 0) or 4
//@@ dimensions if the model does support batching (max_batch_size
//@@ >= 1). In either case the 'dims' below should only specify the
//@@ 3 non-batch dimensions (i.e. HWC or CHW).
//@@
FORMAT_NHWC = 1;
//@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2
//@@
//@@ CHW image format. Tensors with this format require 3 dimensions
//@@ if the model does not support batching (max_batch_size = 0) or 4
//@@ dimensions if the model does support batching (max_batch_size
//@@ >= 1). In either case the 'dims' below should only specify the
//@@ 3 non-batch dimensions (i.e. HWC or CHW).
//@@
FORMAT_NCHW = 2;
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the input.
//@@
string name = 1;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the input.
//@@
DataType data_type = 2;
//@@ .. cpp:var:: Format format
//@@
//@@ The format of the input. Optional.
//@@
Format format = 3;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The dimensions/shape of the input tensor that must be provided
//@@ when invoking the inference API for this model.
//@@
repeated int64 dims = 4;
//@@ .. cpp:var:: ModelTensorReshape reshape
//@@
//@@ The shape expected for this input by the backend. The input will
//@@ be reshaped to this before being presented to the backend. The
//@@ reshape must have the same number of elements as the input shape
//@@ specified by 'dims'. Optional.
//@@
ModelTensorReshape reshape = 5;
//@@ .. cpp:var:: bool is_shape_tensor
//@@
//@@ Whether or not the input is a shape tensor to the model. This field
//@@ is currently supported only for the TensorRT model. An error will be
//@@ generated if this specification does not comply with underlying
//@@ model.
//@@
bool is_shape_tensor = 6;
//@@ .. cpp:var:: bool allow_ragged_batch
//@@
//@@ Whether or not the input is allowed to be "ragged" in a dynamically
//@@ created batch. Default is false indicating that two requests will
//@@ only be batched if this tensor has the same shape in both requests.
//@@ True indicates that two requests can be batched even if this tensor
//@@ has a different shape in each request.
//@@
bool allow_ragged_batch = 7;
//@@ .. cpp:var:: bool optional
//@@
//@@ Whether or not the input is optional for the model execution.
//@@ If true, the input is not required in the inference request.
//@@ Default value is false.
//@@
bool optional = 8;
}
//@@
//@@.. cpp:var:: message ModelOutput
//@@
//@@ An output produced by the model.
//@@
message ModelOutput
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the output.
//@@
string name = 1;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the output.
//@@
DataType data_type = 2;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The dimensions/shape of the output tensor.
//@@
repeated int64 dims = 3;
//@@ .. cpp:var:: ModelTensorReshape reshape
//@@
//@@ The shape produced for this output by the backend. The output will
  //@@  be reshaped from this to the shape specified in 'dims' before being
//@@ returned in the inference response. The reshape must have the same
//@@ number of elements as the output shape specified by 'dims'. Optional.
//@@
ModelTensorReshape reshape = 5;
//@@ .. cpp:var:: string label_filename
//@@
//@@ The label file associated with this output. Should be specified only
//@@ for outputs that represent classifications. Optional.
//@@
string label_filename = 4;
//@@ .. cpp:var:: bool is_shape_tensor
//@@
//@@ Whether or not the output is a shape tensor to the model. This field
//@@ is currently supported only for the TensorRT model. An error will be
//@@ generated if this specification does not comply with underlying
//@@ model.
//@@
bool is_shape_tensor = 6;
}
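// Illustrative sketch (not part of the upstream file): ModelInput and ModelOutput
// entries as they could appear in a config.pbtxt, assuming the conventional
// 'input' and 'output' fields of the top-level model configuration; the tensor
// names are hypothetical.
//
//   input [
//     {
//       name: "image"
//       data_type: TYPE_FP32
//       format: FORMAT_NCHW
//       dims: [ 3, 224, 224 ]
//     }
//   ]
//   output [
//     {
//       name: "scores"
//       data_type: TYPE_FP32
//       dims: [ 1000 ]
//       label_filename: "labels.txt"
//     }
//   ]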
//@@ .. cpp:var:: message BatchInput
//@@
//@@ A batch input is an additional input that must be added by
//@@ the backend based on all the requests in a batch.
//@@
message BatchInput
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the batch input.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0
//@@
//@@ The element count of the 'source_input' will be added as
//@@ input with shape [1].
//@@
BATCH_ELEMENT_COUNT = 0;
//@@ .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1
//@@
//@@ The accumulated element count of the 'source_input' will be
//@@ added as input with shape [1]. For example, if there is a
    //@@        batch of two requests, each with 2 elements, an input of value
//@@ 2 will be added to the first request, and an input of value
//@@ 4 will be added to the second request.
//@@
BATCH_ACCUMULATED_ELEMENT_COUNT = 1;
//@@ .. cpp:enumerator::
//@@ Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2
//@@
//@@ The accumulated element count of the 'source_input' will be
//@@ added as input with shape [1], except for the first request
//@@ in the batch. For the first request in the batch, the input
//@@ will have shape [2] where the first element is value 0.
//@@
BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2;
//@@ .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3
//@@
//@@ Among the requests in the batch, the max element count of the
//@@ 'source_input' will be added as input with shape
//@@ [max_element_count] for the first request in the batch.
//@@ For other requests, such input will be with shape [0].
//@@ The data of the tensor will be uninitialized.
//@@
BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3;
//@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4
//@@
//@@ Among the requests in the batch, the shape of the
//@@ 'source_input' will be added as input with shape
//@@ [batch_size, len(input_dim)]. For example, if one
//@@ batch-2 input with shape [3, 1] and batch-1 input
//@@ with shape [2, 2] are batched, the batch input will
//@@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]].
//@@
BATCH_ITEM_SHAPE = 4;
//@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5
//@@
//@@ Among the requests in the batch, the shape of the
//@@ 'source_input' will be added as input with single dimensional
//@@ shape [batch_size * len(input_dim)]. For example, if one
//@@ batch-2 input with shape [3, 1] and batch-1 input
//@@ with shape [2, 2] are batched, the batch input will
//@@ have shape [6] and value [3, 1, 3, 1, 2, 2].
//@@
BATCH_ITEM_SHAPE_FLATTEN = 5;
}
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this batch input.
//@@
Kind kind = 1;
//@@ .. cpp:var:: string target_name (repeated)
//@@
//@@ The name of the model inputs that the backend will create
//@@ for this batch input.
//@@
repeated string target_name = 2;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The input's datatype. The data type can be TYPE_INT32 or
//@@ TYPE_FP32.
//@@
DataType data_type = 3;
//@@ .. cpp:var:: string source_input (repeated)
//@@
//@@ The backend derives the value for each batch input from one or
//@@ more other inputs. 'source_input' gives the names of those
//@@ inputs.
//@@
repeated string source_input = 4;
}
//@@.. cpp:var:: message BatchOutput
//@@
//@@ A batch output is an output produced by the model that must be handled
//@@ differently by the backend based on all the requests in a batch.
//@@
message BatchOutput
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the batch output.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0
//@@
//@@ The output should be scattered according to the shape of
//@@ 'source_input'. The dynamic dimension of the output will
//@@ be set to the value of the same dimension in the input.
//@@
BATCH_SCATTER_WITH_INPUT_SHAPE = 0;
}
//@@ .. cpp:var:: string target_name (repeated)
//@@
//@@ The name of the outputs to be produced by this batch output
//@@ specification.
//@@
repeated string target_name = 1;
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this batch output.
//@@
Kind kind = 2;
//@@ .. cpp:var:: string source_input (repeated)
//@@
//@@ The backend derives each batch output from one or more inputs.
//@@ 'source_input' gives the names of those inputs.
//@@
repeated string source_input = 3;
}
//@@
//@@.. cpp:var:: message ModelVersionPolicy
//@@
//@@ Policy indicating which versions of a model should be made
//@@ available by the inference server.
//@@
message ModelVersionPolicy
{
//@@ .. cpp:var:: message Latest
//@@
//@@ Serve only the latest version(s) of a model. This is
//@@ the default policy.
//@@
message Latest
{
//@@ .. cpp:var:: uint32 num_versions
//@@
    //@@      Serve only the 'num_versions' highest-numbered versions.
//@@ The default value of 'num_versions' is 1, indicating that by
//@@ default only the single highest-number version of a
//@@ model will be served.
//@@
uint32 num_versions = 1;
}
//@@ .. cpp:var:: message All
//@@
//@@ Serve all versions of the model.
//@@
message All {}
//@@ .. cpp:var:: message Specific
//@@
//@@ Serve only specific versions of the model.
//@@
message Specific
{
//@@ .. cpp:var:: int64 versions (repeated)
//@@
//@@ The specific versions of the model that will be served.
//@@
repeated int64 versions = 1;
}
//@@ .. cpp:var:: oneof policy_choice
//@@
//@@ Each model must implement only a single version policy. The
//@@ default policy is 'Latest'.
//@@
oneof policy_choice
{
//@@ .. cpp:var:: Latest latest
//@@
//@@ Serve only latest version(s) of the model.
//@@
Latest latest = 1;
//@@ .. cpp:var:: All all
//@@
//@@ Serve all versions of the model.
//@@
All all = 2;
//@@ .. cpp:var:: Specific specific
//@@
//@@ Serve only specific version(s) of the model.
//@@
Specific specific = 3;
}
}
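// Illustrative sketch (not part of the upstream file): the three version policies
// as they could appear in a config.pbtxt, assuming the conventional
// 'version_policy' field of the top-level model configuration; exactly one of
// the alternatives would be used.
//
//   version_policy { latest { num_versions: 2 } }
//   version_policy { all {} }
//   version_policy { specific { versions: [ 1, 3 ] } }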
//@@
//@@.. cpp:var:: message ModelOptimizationPolicy
//@@
//@@ Optimization settings for a model. These settings control if/how a
//@@ model is optimized and prioritized by the backend framework when
//@@ it is loaded.
//@@
message ModelOptimizationPolicy
{
//@@
//@@ .. cpp:var:: message Graph
//@@
//@@ Enable generic graph optimization of the model. If not specified
//@@ the framework's default level of optimization is used. Supports
//@@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow
//@@ causes XLA to be enabled/disabled for the model. For Onnx defaults
//@@ to enabling all optimizations, -1 enables only basic optimizations,
//@@ +1 enables only basic and extended optimizations.
//@@
message Graph
{
//@@ .. cpp:var:: int32 level
//@@
//@@ The optimization level. Defaults to 0 (zero) if not specified.
//@@
//@@ - -1: Disabled
//@@ - 0: Framework default
//@@ - 1+: Enable optimization level (greater values indicate
//@@ higher optimization levels)
//@@
int32 level = 1;
}
//@@
//@@ .. cpp:enum:: ModelPriority
//@@
//@@ Model priorities. A model will be given scheduling and execution
//@@ preference over models at lower priorities. Current model
//@@ priorities only work for TensorRT models.
//@@
enum ModelPriority {
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0
//@@
//@@ The default model priority.
//@@
PRIORITY_DEFAULT = 0;
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1
//@@
//@@ The maximum model priority.
//@@
PRIORITY_MAX = 1;
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2
//@@
//@@ The minimum model priority.
//@@
PRIORITY_MIN = 2;
}
//@@
//@@ .. cpp:var:: message Cuda
//@@
//@@ CUDA-specific optimization settings.
//@@
message Cuda
{
//@@ .. cpp:var:: message GraphSpec
//@@
//@@ Specification of the CUDA graph to be captured.
//@@
message GraphSpec
{
      //@@ .. cpp:var:: message Shape
//@@
//@@ Specification of tensor dimension.
//@@
message Shape
{
//@@ .. cpp:var:: int64 dim (repeated)
//@@
//@@ The dimension.
//@@
repeated int64 dim = 1;
}
message LowerBound
{
//@@ .. cpp:var:: int32 batch_size
//@@
//@@ The batch size of the CUDA graph. If 'max_batch_size' is 0,
//@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must
//@@ be set to value between 1 and 'max_batch_size'.
//@@
int32 batch_size = 1;
//@@ .. cpp:var:: map<string, Shape> input
//@@
//@@ The specification of the inputs. 'Shape' is the shape of
//@@ the input without batching dimension.
//@@
map<string, Shape> input = 2;
}
//@@ .. cpp:var:: int32 batch_size
//@@
//@@ The batch size of the CUDA graph. If 'max_batch_size' is 0,
//@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must
//@@ be set to value between 1 and 'max_batch_size'.
//@@
int32 batch_size = 1;
//@@ .. cpp:var:: map<string, Shape> input
//@@
//@@ The specification of the inputs. 'Shape' is the shape of the
//@@ input without batching dimension.
//@@
map<string, Shape> input = 2;
//@@ .. cpp:var:: LowerBound graph_lower_bound
//@@
//@@ Specify the lower bound of the CUDA graph. Optional.
//@@ If specified, the graph can be used for input shapes and
//@@ batch sizes that are in closed interval between the lower
//@@ bound specification and graph specification. For dynamic
//@@ shape model, this allows CUDA graphs to be launched
//@@ frequently without capturing all possible shape combinations.
//@@ However, using graph for shape combinations different from
//@@ the one used for capturing introduces uninitialized data for
//@@ execution and it may distort the inference result if
//@@ the model is sensitive to uninitialized data.
//@@
LowerBound graph_lower_bound = 3;
}
//@@ .. cpp:var:: bool graphs
//@@
//@@ Use CUDA graphs API to capture model operations and execute
//@@ them more efficiently. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool graphs = 1;
//@@ .. cpp:var:: bool busy_wait_events
//@@
//@@ Use busy-waiting to synchronize CUDA events to achieve minimum
//@@ latency from event complete to host thread to be notified, with
//@@ the cost of high CPU load. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool busy_wait_events = 2;
//@@ .. cpp:var:: GraphSpec graph_spec (repeated)
//@@
//@@ Specification of the CUDA graph to be captured. If not specified
//@@ and 'graphs' is true, the default CUDA graphs will be captured
//@@ based on model settings.
//@@ Currently only recognized by TensorRT backend.
//@@
repeated GraphSpec graph_spec = 3;
//@@ .. cpp:var:: bool output_copy_stream
//@@
//@@ Uses a CUDA stream separate from the inference stream to copy the
//@@ output to host. However, be aware that setting this option to
//@@ true will lead to an increase in the memory consumption of the
//@@ model as Triton will allocate twice as much GPU memory for its
//@@ I/O tensor buffers. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool output_copy_stream = 4;
}
//@@
//@@ .. cpp:var:: message ExecutionAccelerators
//@@
//@@ Specify the preferred execution accelerators to be used to execute
//@@ the model. Currently only recognized by ONNX Runtime backend and
//@@ TensorFlow backend.
//@@
//@@ For ONNX Runtime backend, it will deploy the model with the execution
//@@ accelerators by priority, the priority is determined based on the
//@@ order that they are set, i.e. the provider at the front has highest
//@@ priority. Overall, the priority will be in the following order:
//@@ <gpu_execution_accelerator> (if instance is on GPU)
//@@ CUDA Execution Provider (if instance is on GPU)
//@@ <cpu_execution_accelerator>
//@@ Default CPU Execution Provider
//@@
message ExecutionAccelerators
{
//@@
//@@ .. cpp:var:: message Accelerator
//@@
//@@ Specify the accelerator to be used to execute the model.
//@@ Accelerator with the same name may accept different parameters
//@@ depending on the backends.
//@@
message Accelerator
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the execution accelerator.
//@@
string name = 1;
//@@ .. cpp:var:: map<string, string> parameters
//@@
      //@@      Additional parameters used to configure the accelerator.
//@@
map<string, string> parameters = 2;
}
//@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated)
//@@
//@@ The preferred execution provider to be used if the model instance
//@@ is deployed on GPU.
//@@
//@@ For ONNX Runtime backend, possible value is "tensorrt" as name,
//@@ and no parameters are required.
//@@
//@@ For TensorFlow backend, possible values are "tensorrt",
//@@ "auto_mixed_precision", "gpu_io".
//@@
//@@ For "tensorrt", the following parameters can be specified:
//@@ "precision_mode": The precision used for optimization.
//@@ Allowed values are "FP32" and "FP16". Default value is "FP32".
//@@
//@@ "max_cached_engines": The maximum number of cached TensorRT
//@@ engines in dynamic TensorRT ops. Default value is 100.
//@@
//@@ "minimum_segment_size": The smallest model subgraph that will
//@@ be considered for optimization by TensorRT. Default value is 3.
//@@
//@@ "max_workspace_size_bytes": The maximum GPU memory the model
//@@ can use temporarily during execution. Default value is 1GB.
//@@
//@@ For "auto_mixed_precision", no parameters are required. If set,
//@@ the model will try to use FP16 for better performance.
//@@ This optimization can not be set with "tensorrt".
//@@
//@@ For "gpu_io", no parameters are required. If set, the model will
//@@ be executed using TensorFlow Callable API to set input and output
//@@ tensors in GPU memory if possible, which can reduce data transfer
//@@ overhead if the model is used in ensemble. However, the Callable
//@@ object will be created on model creation and it will request all
//@@ outputs for every model execution, which may impact the
//@@ performance if a request does not require all outputs. This
    //@@      optimization will only take effect if the model instance is
//@@ created with KIND_GPU.
//@@
repeated Accelerator gpu_execution_accelerator = 1;
//@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated)
//@@
//@@ The preferred execution provider to be used if the model instance
//@@ is deployed on CPU.
//@@
//@@ For ONNX Runtime backend, possible value is "openvino" as name,
//@@ and no parameters are required.
//@@
repeated Accelerator cpu_execution_accelerator = 2;
}
//@@
//@@ .. cpp:var:: message PinnedMemoryBuffer
//@@
//@@ Specify whether to use a pinned memory buffer when transferring data
//@@ between non-pinned system memory and GPU memory. Using a pinned
//@@ memory buffer for system from/to GPU transfers will typically provide
//@@ increased performance. For example, in the common use case where the
//@@ request provides inputs and delivers outputs via non-pinned system
//@@ memory, if the model instance accepts GPU IOs, the inputs will be
//@@ processed by two copies: from non-pinned system memory to pinned
//@@ memory, and from pinned memory to GPU memory. Similarly, pinned
//@@ memory will be used for delivering the outputs.
//@@
message PinnedMemoryBuffer
{
//@@ .. cpp:var:: bool enable
//@@
//@@ Use pinned memory buffer. Default is true.
//@@
bool enable = 1;
}
//@@ .. cpp:var:: Graph graph
//@@
//@@ The graph optimization setting for the model. Optional.
//@@
Graph graph = 1;
//@@ .. cpp:var:: ModelPriority priority
//@@
//@@ The priority setting for the model. Optional.
//@@
ModelPriority priority = 2;
//@@ .. cpp:var:: Cuda cuda
//@@
//@@ CUDA-specific optimization settings. Optional.
//@@
Cuda cuda = 3;
//@@ .. cpp:var:: ExecutionAccelerators execution_accelerators
//@@
//@@ The accelerators used for the model. Optional.
//@@
ExecutionAccelerators execution_accelerators = 4;
//@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory
//@@
//@@ Use pinned memory buffer when the data transfer for inputs
//@@ is between GPU memory and non-pinned system memory.
//@@ Default is true.
//@@
PinnedMemoryBuffer input_pinned_memory = 5;
//@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory
//@@
//@@ Use pinned memory buffer when the data transfer for outputs
//@@ is between GPU memory and non-pinned system memory.
//@@ Default is true.
//@@
PinnedMemoryBuffer output_pinned_memory = 6;
//@@ .. cpp:var:: uint32 gather_kernel_buffer_threshold
//@@
//@@ The backend may use a gather kernel to gather input data if the
//@@ device has direct access to the source buffer and the destination
//@@ buffer. In such case, the gather kernel will be used only if the
//@@ number of buffers to be gathered is greater or equal to
  //@@  the specified value. If 0, the gather kernel will be disabled.
//@@ Default value is 0.
//@@ Currently only recognized by TensorRT backend.
//@@
uint32 gather_kernel_buffer_threshold = 7;
//@@ .. cpp:var:: bool eager_batching
//@@
//@@ Start preparing the next batch before the model instance is ready
//@@ for the next inference. This option can be used to overlap the
//@@ batch preparation with model execution, with the trade-off that
//@@ the next batch might be smaller than what it could have been.
//@@ Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool eager_batching = 8;
}
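// Illustrative sketch (not part of the upstream file): an optimization section as
// it could appear in a config.pbtxt, assuming the conventional 'optimization'
// field of the top-level model configuration.
//
//   optimization {
//     graph { level: 1 }
//     execution_accelerators {
//       gpu_execution_accelerator [
//         {
//           name: "tensorrt"
//           parameters { key: "precision_mode" value: "FP16" }
//         }
//       ]
//     }
//   }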
//@@
//@@.. cpp:var:: message ModelQueuePolicy
//@@
//@@ Queue policy for inference requests.
//@@
message ModelQueuePolicy
{
//@@
//@@ .. cpp:enum:: TimeoutAction
//@@
//@@ The action applied to timed-out requests.
//@@
enum TimeoutAction {
//@@ .. cpp:enumerator:: Action::REJECT = 0
//@@
//@@ Reject the request and return error message accordingly.
//@@
REJECT = 0;
//@@ .. cpp:enumerator:: Action::DELAY = 1
//@@
//@@ Delay the request until all other requests at the same
//@@ (or higher) priority levels that have not reached their timeouts
//@@ are processed. A delayed request will eventually be processed,
//@@ but may be delayed indefinitely due to newly arriving requests.
//@@
DELAY = 1;
}
//@@
//@@ .. cpp:var:: TimeoutAction timeout_action
//@@
//@@ The action applied to timed-out request.
//@@ The default action is REJECT.
//@@
TimeoutAction timeout_action = 1;
//@@
//@@ .. cpp:var:: uint64 default_timeout_microseconds
//@@
//@@ The default timeout for every request, in microseconds.
//@@ The default value is 0 which indicates that no timeout is set.
//@@
uint64 default_timeout_microseconds = 2;
//@@
//@@ .. cpp:var:: bool allow_timeout_override
//@@
//@@ Whether individual request can override the default timeout value.
//@@ When true, individual requests can set a timeout that is less than
//@@ the default timeout value but may not increase the timeout.
//@@ The default value is false.
//@@
bool allow_timeout_override = 3;
//@@
//@@ .. cpp:var:: uint32 max_queue_size
//@@
//@@ The maximum queue size for holding requests. A request will be
//@@ rejected immediately if it can't be enqueued because the queue is
//@@ full. The default value is 0 which indicates that no maximum
//@@ queue size is enforced.
//@@
uint32 max_queue_size = 4;
}
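// Illustrative sketch (not part of the upstream file): a ModelQueuePolicy as it
// could appear as the 'default_queue_policy' of dynamic batching (see
// ModelDynamicBatching below).
//
//   default_queue_policy {
//     timeout_action: DELAY
//     default_timeout_microseconds: 100000
//     allow_timeout_override: true
//     max_queue_size: 16
//   }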
//@@
//@@.. cpp:var:: message ModelDynamicBatching
//@@
//@@ Dynamic batching configuration. These settings control how dynamic
//@@ batching operates for the model.
//@@
message ModelDynamicBatching
{
//@@ .. cpp:var:: int32 preferred_batch_size (repeated)
//@@
//@@ Preferred batch sizes for dynamic batching. If a batch of one of
//@@ these sizes can be formed it will be executed immediately. If
//@@ not specified a preferred batch size will be chosen automatically
//@@ based on model and GPU characteristics.
//@@
repeated int32 preferred_batch_size = 1;
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a request will be delayed in
//@@ the scheduling queue to wait for additional requests for
//@@ batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 2;
//@@ .. cpp:var:: bool preserve_ordering
//@@
//@@ Should the dynamic batcher preserve the ordering of responses to
//@@ match the order of requests received by the scheduler. Default is
//@@ false. If true, the responses will be returned in the same order as
//@@ the order of requests sent to the scheduler. If false, the responses
//@@ may be returned in arbitrary order. This option is specifically
//@@ needed when a sequence of related inference requests (i.e. inference
//@@ requests with the same correlation ID) are sent to the dynamic
//@@ batcher to ensure that the sequence responses are in the correct
//@@ order.
//@@
bool preserve_ordering = 3;
//@@ .. cpp:var:: uint32 priority_levels
//@@
//@@ The number of priority levels to be enabled for the model,
//@@ the priority level starts from 1 and 1 is the highest priority.
//@@ Requests are handled in priority order with all priority 1 requests
//@@ processed before priority 2, all priority 2 requests processed before
//@@ priority 3, etc. Requests with the same priority level will be
//@@ handled in the order that they are received.
//@@
uint32 priority_levels = 4;
//@@ .. cpp:var:: uint32 default_priority_level
//@@
//@@ The priority level used for requests that don't specify their
//@@ priority. The value must be in the range [ 1, 'priority_levels' ].
//@@
uint32 default_priority_level = 5;
//@@ .. cpp:var:: ModelQueuePolicy default_queue_policy
//@@
//@@ The default queue policy used for requests that don't require
//@@ priority handling and requests that specify priority levels where
//@@ there is no specific policy given. If not specified, a policy with
//@@ default field values will be used.
//@@
ModelQueuePolicy default_queue_policy = 6;
//@@ .. cpp:var:: map<uint32, ModelQueuePolicy> priority_queue_policy
//@@
//@@ Specify the queue policy for the priority level. The default queue
//@@ policy will be used if a priority level doesn't specify a queue
//@@ policy.
//@@
map<uint32, ModelQueuePolicy> priority_queue_policy = 7;
}
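// Illustrative sketch (not part of the upstream file): a dynamic batching section
// as it could appear in a config.pbtxt, assuming the conventional
// 'dynamic_batching' field of the top-level model configuration.
//
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//     preserve_ordering: true
//   }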
//@@
//@@.. cpp:var:: message ModelSequenceBatching
//@@
//@@ Sequence batching configuration. These settings control how sequence
//@@ batching operates for the model.
//@@
message ModelSequenceBatching
{
//@@ .. cpp:var:: message Control
//@@
//@@ A control is a signal that the sequence batcher uses to
//@@ communicate with a backend.
//@@
message Control
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the control.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0
//@@
//@@ A new sequence is/is-not starting. If true a sequence is
//@@ starting, if false a sequence is continuing. Must
//@@ specify either int32_false_true, fp32_false_true or
//@@ bool_false_true for this control. This control is optional.
//@@
CONTROL_SEQUENCE_START = 0;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1
//@@
//@@ A sequence is/is-not ready for inference. If true the
//@@ input tensor data is valid and should be used. If false
//@@ the input tensor data is invalid and inferencing should
//@@ be "skipped". Must specify either int32_false_true,
//@@ fp32_false_true or bool_false_true for this control. This
//@@ control is optional.
//@@
CONTROL_SEQUENCE_READY = 1;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
//@@
//@@ A sequence is/is-not ending. If true a sequence is
//@@ ending, if false a sequence is continuing. Must specify
//@@ either int32_false_true, fp32_false_true or bool_false_true
//@@ for this control. This control is optional.
//@@
CONTROL_SEQUENCE_END = 2;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3
//@@
//@@ The correlation ID of the sequence. The correlation ID
//@@ is an uint64_t value that is communicated in whole or
//@@ in part by the tensor. The tensor's datatype must be
//@@ specified by data_type and must be TYPE_UINT64, TYPE_INT64,
//@@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified
//@@ the correlation ID will be truncated to the low-order 32
//@@ bits. This control is optional.
//@@
CONTROL_SEQUENCE_CORRID = 3;
}
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this control.
//@@
Kind kind = 1;
//@@ .. cpp:var:: int32 int32_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in an int32 tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'int32_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated int32 int32_false_true = 2;
//@@ .. cpp:var:: float fp32_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in a fp32 tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'fp32_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated float fp32_false_true = 3;
//@@ .. cpp:var:: bool bool_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in a bool tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'bool_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated bool bool_false_true = 5;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The control's datatype.
//@@
DataType data_type = 4;
}
//@@ .. cpp:var:: message ControlInput
//@@
//@@ The sequence control values to communicate by a model input.
//@@
message ControlInput
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model input.
//@@
string name = 1;
//@@ .. cpp:var:: Control control (repeated)
//@@
//@@ The control value(s) that should be communicated to the
//@@ model using this model input.
//@@
repeated Control control = 2;
}
//@@
//@@ .. cpp:var:: message InitialState
//@@
//@@ Settings used to initialize data for implicit state.
//@@
message InitialState
{
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the state.
//@@
DataType data_type = 1;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The shape of the state tensor, not including the batch dimension.
//@@
repeated int64 dims = 2;
//@@ .. cpp:var:: oneof state_data
//@@
//@@ Specify how the initial state data is generated.
//@@
oneof state_data
{
//@@
//@@ .. cpp:var:: bool zero_data
//@@
//@@ The identifier for using zeros as initial state data.
//@@ Note that the value of 'zero_data' will not be checked,
//@@ instead, zero data will be used as long as the field is set.
//@@
bool zero_data = 3;
//@@ .. cpp:var:: string data_file
//@@
//@@ The file whose content will be used as the initial data for
//@@ the state in row-major order. The file must be provided in
//@@ sub-directory 'initial_state' under the model directory.
//@@
string data_file = 4;
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the state initialization.
//@@
string name = 5;
}
//@@ .. cpp:var:: message State
//@@
//@@ An input / output pair of tensors that carry state for the sequence.
//@@
message State
{
//@@ .. cpp:var:: string input_name
//@@
//@@ The name of the model state input.
//@@
string input_name = 1;
//@@ .. cpp:var:: string output_name
//@@
//@@ The name of the model state output.
//@@
string output_name = 2;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the state.
//@@
DataType data_type = 3;
    //@@ .. cpp:var:: int64 dims (repeated)
    //@@
    //@@     The dimensions/shape of the state tensor.
//@@
repeated int64 dims = 4;
//@@ .. cpp:var:: InitialState initial_state (repeated)
//@@
//@@ The optional field to specify the initial state for the model.
//@@
repeated InitialState initial_state = 5;
}
//@@ .. cpp:var:: message StrategyDirect
//@@
//@@ The sequence batcher uses a specific, unique batch
//@@ slot for each sequence. All inference requests in a
//@@ sequence are directed to the same batch slot in the same
//@@ model instance over the lifetime of the sequence. This
//@@ is the default strategy.
//@@
message StrategyDirect
{
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a candidate request
//@@ will be delayed in the sequence batch scheduling queue to
//@@ wait for additional requests for batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 1;
//@@ .. cpp:var:: float minimum_slot_utilization
//@@
//@@ The minimum slot utilization that must be satisfied to
//@@ execute the batch before 'max_queue_delay_microseconds' expires.
//@@ For example, a value of 0.5 indicates that the batch should be
//@@ executed as soon as 50% or more of the slots are ready even if
//@@ the 'max_queue_delay_microseconds' timeout has not expired.
//@@ The default is 0.0, indicating that a batch will be executed
//@@ before 'max_queue_delay_microseconds' timeout expires if at least
//@@ one batch slot is ready. 'max_queue_delay_microseconds' will be
//@@ ignored unless minimum_slot_utilization is set to a non-zero
//@@ value.
//@@
float minimum_slot_utilization = 2;
}
//@@ .. cpp:var:: message StrategyOldest
//@@
//@@ The sequence batcher maintains up to 'max_candidate_sequences'
//@@ candidate sequences. 'max_candidate_sequences' can be greater
//@@ than the model's 'max_batch_size'. For inferencing the batcher
//@@ chooses from the candidate sequences up to 'max_batch_size'
//@@ inference requests. Requests are chosen in an oldest-first
//@@ manner across all candidate sequences. A given sequence is
//@@ not guaranteed to be assigned to the same batch slot for
//@@ all inference requests of that sequence.
//@@
message StrategyOldest
{
//@@ .. cpp:var:: int32 max_candidate_sequences
//@@
//@@ Maximum number of candidate sequences that the batcher
    //@@    maintains. Excess sequences are kept in an ordered backlog
//@@ and become candidates when existing candidate sequences
//@@ complete.
//@@
int32 max_candidate_sequences = 1;
//@@ .. cpp:var:: int32 preferred_batch_size (repeated)
//@@
//@@ Preferred batch sizes for dynamic batching of candidate
//@@ sequences. If a batch of one of these sizes can be formed
//@@ it will be executed immediately. If not specified a
//@@ preferred batch size will be chosen automatically
//@@ based on model and GPU characteristics.
//@@
repeated int32 preferred_batch_size = 2;
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a candidate request
//@@ will be delayed in the dynamic batch scheduling queue to
//@@ wait for additional requests for batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 3;
}
//@@ .. cpp:var:: oneof strategy_choice
//@@
//@@ The strategy used by the sequence batcher. Default strategy
//@@ is 'direct'.
//@@
oneof strategy_choice
{
//@@ .. cpp:var:: StrategyDirect direct
//@@
//@@ StrategyDirect scheduling strategy.
//@@
StrategyDirect direct = 3;
//@@ .. cpp:var:: StrategyOldest oldest
//@@
//@@ StrategyOldest scheduling strategy.
//@@
StrategyOldest oldest = 4;
}
//@@ .. cpp:var:: uint64 max_sequence_idle_microseconds
//@@
//@@ The maximum time, in microseconds, that a sequence is allowed to
//@@ be idle before it is aborted. The inference server considers a
//@@ sequence idle when it does not have any inference request queued
//@@ for the sequence. If this limit is exceeded, the inference server
//@@ will free the sequence slot allocated by the sequence and make it
//@@ available for another sequence. If not specified (or specified as
//@@ zero) a default value of 1000000 (1 second) is used.
//@@
uint64 max_sequence_idle_microseconds = 1;
//@@ .. cpp:var:: ControlInput control_input (repeated)
//@@
//@@ The model input(s) that the server should use to communicate
//@@ sequence start, stop, ready and similar control values to the
//@@ model.
//@@
repeated ControlInput control_input = 2;
//@@ .. cpp:var:: State state (repeated)
//@@
//@@ The optional state that can be stored in Triton for performing
//@@ inference requests on a sequence. Each sequence holds an implicit
//@@ state local to itself. The output state tensor provided by the
//@@ model in 'output_name' field of the current inference request will
//@@ be transferred as an input tensor named 'input_name' in the next
//@@ request of the same sequence. The input state of the first request
//@@ in the sequence contains garbage data.
//@@
repeated State state = 5;
}
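// Illustrative sketch (not part of the upstream file): a sequence batching section
// as it could appear in a config.pbtxt, assuming the conventional
// 'sequence_batching' field of the top-level model configuration; the control
// input name is hypothetical.
//
//   sequence_batching {
//     max_sequence_idle_microseconds: 5000000
//     oldest { max_candidate_sequences: 4 }
//     control_input [
//       {
//         name: "START"
//         control [
//           { kind: CONTROL_SEQUENCE_START  int32_false_true: [ 0, 1 ] }
//         ]
//       }
//     ]
//   }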
//@@
//@@.. cpp:var:: message ModelEnsembling
//@@
//@@ Model ensembling configuration. These settings specify the models that
//@@ compose the ensemble and how data flows between the models.
//@@
message ModelEnsembling
{
//@@ .. cpp:var:: message Step
//@@
//@@ Each step specifies a model included in the ensemble,
//@@ maps ensemble tensor names to the model input tensors,
//@@ and maps model output tensors to ensemble tensor names
//@@
message Step
{
//@@ .. cpp:var:: string model_name
//@@
//@@ The name of the model to execute for this step of the ensemble.
//@@
string model_name = 1;
//@@ .. cpp:var:: int64 model_version
//@@
//@@ The version of the model to use for inference. If -1
//@@ the latest/most-recent version of the model is used.
//@@
int64 model_version = 2;
//@@ .. cpp:var:: map<string,string> input_map
//@@
//@@ Map from name of an input tensor on this step's model to ensemble
//@@ tensor name. The ensemble tensor must have the same data type and
//@@ shape as the model input. Each model input must be assigned to
//@@ one ensemble tensor, but the same ensemble tensor can be assigned
//@@ to multiple model inputs.
//@@
map<string, string> input_map = 3;
//@@ .. cpp:var:: map<string,string> output_map
//@@
//@@ Map from name of an output tensor on this step's model to ensemble
//@@ tensor name. The data type and shape of the ensemble tensor will
//@@ be inferred from the model output. It is optional to assign all
//@@ model outputs to ensemble tensors. One ensemble tensor name
//@@ can appear in an output map only once.
//@@
map<string, string> output_map = 4;
}
//@@ .. cpp:var:: Step step (repeated)
//@@
//@@ The models and the input / output mappings used within the ensemble.
//@@
repeated Step step = 1;
}
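//
// Illustrative sketch (not part of the original schema comments): an
// ensemble_scheduling fragment chaining a hypothetical "preprocess" model
// into a "classifier" model. All model and tensor names here are made up
// for the example; the map keys are model tensor names and the map values
// are ensemble tensor names, as described above.
//
//   ensemble_scheduling {
//     step [
//       {
//         model_name: "preprocess"
//         model_version: -1
//         input_map { key: "raw_image" value: "IMAGE" }
//         output_map { key: "preprocessed" value: "preprocessed_image" }
//       },
//       {
//         model_name: "classifier"
//         model_version: -1
//         input_map { key: "input" value: "preprocessed_image" }
//         output_map { key: "scores" value: "CLASSIFICATION" }
//       }
//     ]
//   }
//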
//@@
//@@.. cpp:var:: message ModelParameter
//@@
//@@ A model parameter.
//@@
message ModelParameter
{
//@@ .. cpp:var:: string string_value
//@@
//@@ The string value of the parameter.
//@@
string string_value = 1;
}
//@@
//@@.. cpp:var:: message ModelWarmup
//@@
//@@ Settings used to construct the request sample for model warmup.
//@@
message ModelWarmup
{
//@@
//@@ .. cpp:var:: message Input
//@@
//@@ Meta data associated with an input.
//@@
message Input
{
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the input.
//@@
DataType data_type = 1;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The shape of the input tensor, not including the batch dimension.
//@@
repeated int64 dims = 2;
//@@ .. cpp:var:: oneof input_data_type
//@@
//@@ Specify how the input data is generated. If the input has STRING
//@@ data type and 'random_data' is set, the data generation will fall
//@@ back to 'zero_data'.
//@@
oneof input_data_type
{
//@@
//@@ .. cpp:var:: bool zero_data
//@@
//@@  The identifier for using zeros as input data. Note that the
//@@  value of 'zero_data' will not be checked; instead, zero data
//@@  will be used as long as the field is set.
//@@
bool zero_data = 3;
//@@
//@@ .. cpp:var:: bool random_data
//@@
//@@  The identifier for using random data as input data. Note that
//@@  the value of 'random_data' will not be checked; instead,
//@@  random data will be used as long as the field is set.
//@@
bool random_data = 4;
//@@ .. cpp:var:: string input_data_file
//@@
//@@ The file whose content will be used as raw input data in
//@@ row-major order. The file must be provided in a sub-directory
//@@ 'warmup' under the model directory. The file contents should be
//@@ in binary format. For TYPE_STRING data-type, an element is
//@@ represented by a 4-byte unsigned integer giving the length
//@@ followed by the actual bytes.
//@@
string input_data_file = 5;
}
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the request sample.
//@@
string name = 1;
//@@ .. cpp:var:: uint32 batch_size
//@@
//@@ The batch size of the inference request. This must be >= 1. For
//@@ models that don't support batching, batch_size must be 1. If
//@@ batch_size > 1, the 'inputs' specified below will be duplicated to
//@@ match the batch size requested.
//@@
uint32 batch_size = 2;
//@@ .. cpp:var:: map<string, Input> inputs
//@@
//@@ The warmup meta data associated with every model input, including
//@@ control tensors.
//@@
map<string, Input> inputs = 3;
//@@ .. cpp:var:: uint32 count
//@@
//@@  The number of times this warmup sample will be executed. For
//@@  example, if this field is set to 2, two model executions using this
//@@  sample will be scheduled for warmup. The default value is 0, which
//@@  indicates that this sample will be used only once.
//@@  Note that for sequence models, 'count' may not work well
//@@  because such models often expect a valid sequence of requests, which
//@@  should be represented by a series of warmup samples. 'count > 1'
//@@  essentially "resends" one of the samples, which may invalidate the
//@@  sequence and result in unexpected warmup failure.
//@@
uint32 count = 4;
}
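//
// Illustrative sketch (not part of the original schema comments): a
// model_warmup entry that runs one zero-filled sample before the model is
// marked ready. The input name and shape are assumptions for the example.
//
//   model_warmup [
//     {
//       name: "zero_sample"
//       batch_size: 1
//       inputs {
//         key: "INPUT0"
//         value {
//           data_type: TYPE_FP32
//           dims: [ 3, 224, 224 ]
//           zero_data: true
//         }
//       }
//     }
//   ]
//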
//@@
//@@ .. cpp:var:: message ModelOperations
//@@
//@@ The metadata of libraries providing custom operations for this model.
//@@
message ModelOperations
{
//@@ .. cpp:var:: string op_library_filename (repeated)
//@@
//@@ Optional paths of the libraries providing custom operations for
//@@ this model. Valid only for ONNX models.
//@@
repeated string op_library_filename = 1;
}
//@@
//@@ .. cpp:var:: message ModelTransactionPolicy
//@@
//@@ The specification that describes the nature of transactions
//@@ to be expected from the model.
//@@
message ModelTransactionPolicy
{
//@@ .. cpp:var:: bool decoupled
//@@
//@@  Indicates whether responses generated by the model are decoupled from
//@@  the requests issued to it, which means the number of responses
//@@  generated by the model may differ from the number of requests issued,
//@@  and that the responses may be out of order relative to the order of
//@@  requests. The default is false, which means the model will generate
//@@  exactly one response for each request.
//@@
bool decoupled = 1;
}
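//
// Illustrative sketch (not part of the original schema comments): marking
// a model as decoupled so that it may return zero, one or many responses
// per request.
//
//   model_transaction_policy { decoupled: true }
//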
//@@
//@@.. cpp:var:: message ModelRepositoryAgents
//@@
//@@ The repository agents for the model.
//@@
message ModelRepositoryAgents
{
//@@
//@@ .. cpp:var:: message Agent
//@@
//@@ A repository agent that should be invoked for the specified
//@@ repository actions for this model.
//@@
message Agent
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the agent.
//@@
string name = 1;
//@@ .. cpp:var:: map<string, string> parameters
//@@
//@@ The parameters for the agent.
//@@
map<string, string> parameters = 2;
}
//@@
//@@ .. cpp:var:: Agent agents (repeated)
//@@
//@@  The ordered list of agents for the model. These agents will be
//@@  invoked in order to respond to repository actions occurring for the
//@@  model.
//@@
repeated Agent agents = 1;
}
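//
// Illustrative sketch (not part of the original schema comments):
// attaching a repository agent to a model. The agent name and parameter
// key are assumptions; whichever agent is referenced must be installed on
// the serving side.
//
//   model_repository_agents {
//     agents [
//       {
//         name: "checksum"
//         parameters { key: "MD5:model.onnx" value: "<expected md5 hash>" }
//       }
//     ]
//   }
//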
//@@
//@@.. cpp:var:: message ModelResponseCache
//@@
//@@ The response cache setting for the model.
//@@
message ModelResponseCache
{
//@@
//@@  .. cpp:var:: bool enable
//@@
//@@     Whether or not to use the response cache for the model. If true,
//@@     responses from the model are cached and, when an identical request
//@@     is encountered, the cached response is returned instead of
//@@     executing the model. By default, the response cache is disabled
//@@     for the model.
//@@
bool enable = 1;
}
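//
// Illustrative sketch (not part of the original schema comments): opting a
// model into the response cache. Note that the server itself typically
// also needs caching enabled at startup for this to take effect.
//
//   response_cache { enable: true }
//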
//@@
//@@.. cpp:var:: message ModelConfig
//@@
//@@ A model configuration.
//@@
message ModelConfig
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model.
//@@
string name = 1;
//@@ .. cpp:var:: string platform
//@@
//@@ The framework for the model. Possible values are
//@@ "tensorrt_plan", "tensorflow_graphdef",
//@@ "tensorflow_savedmodel", "onnxruntime_onnx",
//@@ "pytorch_libtorch".
//@@
string platform = 2;
//@@ .. cpp:var:: string backend
//@@
//@@ The backend used by the model.
//@@
string backend = 17;
//@@ .. cpp:var:: ModelVersionPolicy version_policy
//@@
//@@ Policy indicating which version(s) of the model will be served.
//@@
ModelVersionPolicy version_policy = 3;
//@@ .. cpp:var:: int32 max_batch_size
//@@
//@@ Maximum batch size allowed for inference. This can only decrease
//@@ what is allowed by the model itself. A max_batch_size value of 0
//@@ indicates that batching is not allowed for the model and the
//@@ dimension/shape of the input and output tensors must exactly
//@@ match what is specified in the input and output configuration. A
//@@ max_batch_size value > 0 indicates that batching is allowed and
//@@ so the model expects the input tensors to have an additional
//@@ initial dimension for the batching that is not specified in the
//@@ input (for example, if the model supports batched inputs of
//@@ 2-dimensional tensors then the model configuration will specify
//@@ the input shape as [ X, Y ] but the model will expect the actual
//@@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0
//@@ returned outputs will also have an additional initial dimension
//@@ for the batch.
//@@
int32 max_batch_size = 4;
//@@ .. cpp:var:: ModelInput input (repeated)
//@@
//@@  The inputs expected by the model.
//@@
repeated ModelInput input = 5;
//@@ .. cpp:var:: ModelOutput output (repeated)
//@@
//@@ The outputs produced by the model.
//@@
repeated ModelOutput output = 6;
//@@ .. cpp:var:: BatchInput batch_input (repeated)
//@@
//@@ The model input(s) that the server should use to communicate
//@@ batch related values to the model.
//@@
repeated BatchInput batch_input = 20;
//@@ .. cpp:var:: BatchOutput batch_output (repeated)
//@@
//@@  The outputs produced by the model that require special handling
//@@ by the model backend.
//@@
repeated BatchOutput batch_output = 21;
//@@ .. cpp:var:: ModelOptimizationPolicy optimization
//@@
//@@  Optimization configuration for the model. If not specified,
//@@  the default optimization policy is used.
//@@
ModelOptimizationPolicy optimization = 12;
//@@ .. cpp:var:: oneof scheduling_choice
//@@
//@@ The scheduling policy for the model. If not specified the
//@@ default scheduling policy is used for the model. The default
//@@ policy is to execute each inference request independently.
//@@
oneof scheduling_choice
{
//@@ .. cpp:var:: ModelDynamicBatching dynamic_batching
//@@
//@@ If specified, enables the dynamic-batching scheduling
//@@ policy. With dynamic-batching the scheduler may group
//@@ together independent requests into a single batch to
//@@ improve inference throughput.
//@@
ModelDynamicBatching dynamic_batching = 11;
//@@ .. cpp:var:: ModelSequenceBatching sequence_batching
//@@
//@@ If specified, enables the sequence-batching scheduling
//@@ policy. With sequence-batching, inference requests
//@@ with the same correlation ID are routed to the same
//@@ model instance. Multiple sequences of inference requests
//@@ may be batched together into a single batch to
//@@ improve inference throughput.
//@@
ModelSequenceBatching sequence_batching = 13;
//@@ .. cpp:var:: ModelEnsembling ensemble_scheduling
//@@
//@@ If specified, enables the model-ensembling scheduling
//@@ policy. With model-ensembling, inference requests
//@@ will be processed according to the specification, such as an
//@@ execution sequence of models. The input specified in this model
//@@ config will be the input for the ensemble, and the output
//@@ specified will be the output of the ensemble.
//@@
ModelEnsembling ensemble_scheduling = 15;
}
//@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated)
//@@
//@@ Instances of this model. If not specified, one instance
//@@ of the model will be instantiated on each available GPU.
//@@
repeated ModelInstanceGroup instance_group = 7;
//@@ .. cpp:var:: string default_model_filename
//@@
//@@ Optional filename of the model file to use if a
//@@ compute-capability specific model is not specified in
//@@ :cpp:var:`cc_model_filenames`. If not specified the default name
//@@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or
//@@ 'model.pt' depending on the model type.
//@@
string default_model_filename = 8;
//@@ .. cpp:var:: map<string,string> cc_model_filenames
//@@
//@@ Optional map from CUDA compute capability to the filename of
//@@ the model that supports that compute capability. The filename
//@@ refers to a file within the model version directory.
//@@
map<string, string> cc_model_filenames = 9;
//@@ .. cpp:var:: map<string,string> metric_tags
//@@
//@@ Optional metric tags. User-specific key-value pairs for metrics
//@@ reported for this model. These tags are applied to the metrics
//@@ reported on the HTTP metrics port.
//@@
map<string, string> metric_tags = 10;
//@@ .. cpp:var:: map<string,ModelParameter> parameters
//@@
//@@ Optional model parameters. User-specified parameter values.
//@@
map<string, ModelParameter> parameters = 14;
//@@ .. cpp:var:: ModelWarmup model_warmup (repeated)
//@@
//@@ Warmup setting of this model. If specified, all instances
//@@ will be run with the request samples in sequence before
//@@ serving the model.
//@@ This field can only be specified if the model is not an ensemble
//@@ model.
//@@
repeated ModelWarmup model_warmup = 16;
//@@ .. cpp:var:: ModelOperations model_operations
//@@
//@@ Optional metadata of the libraries providing custom operations for
//@@ this model.
//@@
ModelOperations model_operations = 18;
//@@ .. cpp:var:: ModelTransactionPolicy model_transaction_policy
//@@
//@@ Optional specification that describes the nature of transactions
//@@ to be expected from the model.
//@@
ModelTransactionPolicy model_transaction_policy = 19;
//@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents
//@@
//@@ Optional specification of the agent(s) that should be invoked
//@@  when repository actions are performed for this model.
//@@
ModelRepositoryAgents model_repository_agents = 23;
//@@ .. cpp:var:: ModelResponseCache response_cache
//@@
//@@ Optional setting for utilizing the response cache for this
//@@ model.
//@@
ModelResponseCache response_cache = 24;
}
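//
// Illustrative sketch (not part of the original schema comments): a minimal
// complete config.pbtxt tying the messages above together. The model name,
// backend, tensor names and shapes are assumptions for the example.
//
//   name: "resnet50"
//   backend: "onnxruntime"
//   max_batch_size: 8
//   input [
//     {
//       name: "INPUT0"
//       data_type: TYPE_FP32
//       dims: [ 3, 224, 224 ]
//     }
//   ]
//   output [
//     {
//       name: "OUTPUT0"
//       data_type: TYPE_FP32
//       dims: [ 1000 ]
//     }
//   ]
//   instance_group [ { kind: KIND_GPU, count: 1 } ]
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//   }
//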
\ No newline at end of file
# flake8: noqa
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: model_config.protxt
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import enum_type_wrapper
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x13model_config.protxt\x12\tinference\"\x96\x01\n\x10ModelRateLimiter\x12\x37\n\tresources\x18\x01 \x03(\x0b\x32$.inference.ModelRateLimiter.Resource\x12\x10\n\x08priority\x18\x02 \x01(\r\x1a\x37\n\x08Resource\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0e\n\x06global\x18\x02 \x01(\x08\x12\r\n\x05\x63ount\x18\x03 \x01(\r\"\x87\x04\n\x12ModelInstanceGroup\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x30\n\x04kind\x18\x04 \x01(\x0e\x32\".inference.ModelInstanceGroup.Kind\x12\r\n\x05\x63ount\x18\x02 \x01(\x05\x12\x31\n\x0crate_limiter\x18\x06 \x01(\x0b\x32\x1b.inference.ModelRateLimiter\x12\x0c\n\x04gpus\x18\x03 \x03(\x05\x12H\n\x11secondary_devices\x18\x08 \x03(\x0b\x32-.inference.ModelInstanceGroup.SecondaryDevice\x12\x0f\n\x07profile\x18\x05 \x03(\t\x12\x0f\n\x07passive\x18\x07 \x01(\x08\x12\x13\n\x0bhost_policy\x18\t \x01(\t\x1a\x9c\x01\n\x0fSecondaryDevice\x12O\n\x04kind\x18\x01 \x01(\x0e\x32\x41.inference.ModelInstanceGroup.SecondaryDevice.SecondaryDeviceKind\x12\x11\n\tdevice_id\x18\x02 \x01(\x03\"%\n\x13SecondaryDeviceKind\x12\x0e\n\nKIND_NVDLA\x10\x00\"A\n\x04Kind\x12\r\n\tKIND_AUTO\x10\x00\x12\x0c\n\x08KIND_GPU\x10\x01\x12\x0c\n\x08KIND_CPU\x10\x02\x12\x0e\n\nKIND_MODEL\x10\x03\"#\n\x12ModelTensorReshape\x12\r\n\x05shape\x18\x01 \x03(\x03\"\xb2\x02\n\nModelInput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\tdata_type\x18\x02 \x01(\x0e\x32\x13.inference.DataType\x12,\n\x06\x66ormat\x18\x03 \x01(\x0e\x32\x1c.inference.ModelInput.Format\x12\x0c\n\x04\x64ims\x18\x04 \x03(\x03\x12.\n\x07reshape\x18\x05 \x01(\x0b\x32\x1d.inference.ModelTensorReshape\x12\x17\n\x0fis_shape_tensor\x18\x06 \x01(\x08\x12\x1a\n\x12\x61llow_ragged_batch\x18\x07 \x01(\x08\x12\x10\n\x08optional\x18\x08 \x01(\x08\";\n\x06\x46ormat\x12\x0f\n\x0b\x46ORMAT_NONE\x10\x00\x12\x0f\n\x0b\x46ORMAT_NHWC\x10\x01\x12\x0f\n\x0b\x46ORMAT_NCHW\x10\x02\"\xb2\x01\n\x0bModelOutput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\tdata_type\x18\x02 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x03 \x03(\x03\x12.\n\x07reshape\x18\x05 \x01(\x0b\x32\x1d.inference.ModelTensorReshape\x12\x16\n\x0elabel_filename\x18\x04 \x01(\t\x12\x17\n\x0fis_shape_tensor\x18\x06 \x01(\x08\"\xd9\x02\n\nBatchInput\x12(\n\x04kind\x18\x01 \x01(\x0e\x32\x1a.inference.BatchInput.Kind\x12\x13\n\x0btarget_name\x18\x02 \x03(\t\x12&\n\tdata_type\x18\x03 \x01(\x0e\x32\x13.inference.DataType\x12\x14\n\x0csource_input\x18\x04 \x03(\t\"\xcd\x01\n\x04Kind\x12\x17\n\x13\x42\x41TCH_ELEMENT_COUNT\x10\x00\x12#\n\x1f\x42\x41TCH_ACCUMULATED_ELEMENT_COUNT\x10\x01\x12-\n)BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO\x10\x02\x12$\n BATCH_MAX_ELEMENT_COUNT_AS_SHAPE\x10\x03\x12\x14\n\x10\x42\x41TCH_ITEM_SHAPE\x10\x04\x12\x1c\n\x18\x42\x41TCH_ITEM_SHAPE_FLATTEN\x10\x05\"\x8f\x01\n\x0b\x42\x61tchOutput\x12\x13\n\x0btarget_name\x18\x01 \x03(\t\x12)\n\x04kind\x18\x02 \x01(\x0e\x32\x1b.inference.BatchOutput.Kind\x12\x14\n\x0csource_input\x18\x03 \x03(\t\"*\n\x04Kind\x12\"\n\x1e\x42\x41TCH_SCATTER_WITH_INPUT_SHAPE\x10\x00\"\x90\x02\n\x12ModelVersionPolicy\x12\x36\n\x06latest\x18\x01 \x01(\x0b\x32$.inference.ModelVersionPolicy.LatestH\x00\x12\x30\n\x03\x61ll\x18\x02 \x01(\x0b\x32!.inference.ModelVersionPolicy.AllH\x00\x12:\n\x08specific\x18\x03 \x01(\x0b\x32&.inference.ModelVersionPolicy.SpecificH\x00\x1a\x1e\n\x06Latest\x12\x14\n\x0cnum_versions\x18\x01 \x01(\r\x1a\x05\n\x03\x41ll\x1a\x1c\n\x08Specific\x12\x10\n\x08versions\x18\x01 \x03(\x03\x42\x0f\n\rpolicy_choice\"\xfd\r\n\x17ModelOptimizationPolicy\x12\x37\n\x05graph\x18\x01 
\x01(\x0b\x32(.inference.ModelOptimizationPolicy.Graph\x12\x42\n\x08priority\x18\x02 \x01(\x0e\x32\x30.inference.ModelOptimizationPolicy.ModelPriority\x12\x35\n\x04\x63uda\x18\x03 \x01(\x0b\x32\'.inference.ModelOptimizationPolicy.Cuda\x12X\n\x16\x65xecution_accelerators\x18\x04 \x01(\x0b\x32\x38.inference.ModelOptimizationPolicy.ExecutionAccelerators\x12R\n\x13input_pinned_memory\x18\x05 \x01(\x0b\x32\x35.inference.ModelOptimizationPolicy.PinnedMemoryBuffer\x12S\n\x14output_pinned_memory\x18\x06 \x01(\x0b\x32\x35.inference.ModelOptimizationPolicy.PinnedMemoryBuffer\x12&\n\x1egather_kernel_buffer_threshold\x18\x07 \x01(\r\x12\x16\n\x0e\x65\x61ger_batching\x18\x08 \x01(\x08\x1a\x16\n\x05Graph\x12\r\n\x05level\x18\x01 \x01(\x05\x1a\xba\x05\n\x04\x43uda\x12\x0e\n\x06graphs\x18\x01 \x01(\x08\x12\x18\n\x10\x62usy_wait_events\x18\x02 \x01(\x08\x12\x45\n\ngraph_spec\x18\x03 \x03(\x0b\x32\x31.inference.ModelOptimizationPolicy.Cuda.GraphSpec\x12\x1a\n\x12output_copy_stream\x18\x04 \x01(\x08\x1a\xa4\x04\n\tGraphSpec\x12\x12\n\nbatch_size\x18\x01 \x01(\x05\x12K\n\x05input\x18\x02 \x03(\x0b\x32<.inference.ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry\x12W\n\x11graph_lower_bound\x18\x03 \x01(\x0b\x32<.inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound\x1a\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x03\x1a\xdf\x01\n\nLowerBound\x12\x12\n\nbatch_size\x18\x01 \x01(\x05\x12V\n\x05input\x18\x02 \x03(\x0b\x32G.inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry\x1a\x65\n\nInputEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x46\n\x05value\x18\x02 \x01(\x0b\x32\x37.inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape:\x02\x38\x01\x1a\x65\n\nInputEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x46\n\x05value\x18\x02 \x01(\x0b\x32\x37.inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape:\x02\x38\x01\x1a\xa4\x03\n\x15\x45xecutionAccelerators\x12g\n\x19gpu_execution_accelerator\x18\x01 \x03(\x0b\x32\x44.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator\x12g\n\x19\x63pu_execution_accelerator\x18\x02 \x03(\x0b\x32\x44.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator\x1a\xb8\x01\n\x0b\x41\x63\x63\x65lerator\x12\x0c\n\x04name\x18\x01 \x01(\t\x12h\n\nparameters\x18\x02 \x03(\x0b\x32T.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry\x1a\x31\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a$\n\x12PinnedMemoryBuffer\x12\x0e\n\x06\x65nable\x18\x01 \x01(\x08\"I\n\rModelPriority\x12\x14\n\x10PRIORITY_DEFAULT\x10\x00\x12\x10\n\x0cPRIORITY_MAX\x10\x01\x12\x10\n\x0cPRIORITY_MIN\x10\x02\"\xdb\x01\n\x10ModelQueuePolicy\x12\x41\n\x0etimeout_action\x18\x01 \x01(\x0e\x32).inference.ModelQueuePolicy.TimeoutAction\x12$\n\x1c\x64\x65\x66\x61ult_timeout_microseconds\x18\x02 \x01(\x04\x12\x1e\n\x16\x61llow_timeout_override\x18\x03 \x01(\x08\x12\x16\n\x0emax_queue_size\x18\x04 \x01(\r\"&\n\rTimeoutAction\x12\n\n\x06REJECT\x10\x00\x12\t\n\x05\x44\x45LAY\x10\x01\"\x9b\x03\n\x14ModelDynamicBatching\x12\x1c\n\x14preferred_batch_size\x18\x01 \x03(\x05\x12$\n\x1cmax_queue_delay_microseconds\x18\x02 \x01(\x04\x12\x19\n\x11preserve_ordering\x18\x03 \x01(\x08\x12\x17\n\x0fpriority_levels\x18\x04 \x01(\r\x12\x1e\n\x16\x64\x65\x66\x61ult_priority_level\x18\x05 \x01(\r\x12\x39\n\x14\x64\x65\x66\x61ult_queue_policy\x18\x06 \x01(\x0b\x32\x1b.inference.ModelQueuePolicy\x12W\n\x15priority_queue_policy\x18\x07 
\x03(\x0b\x32\x38.inference.ModelDynamicBatching.PriorityQueuePolicyEntry\x1aW\n\x18PriorityQueuePolicyEntry\x12\x0b\n\x03key\x18\x01 \x01(\r\x12*\n\x05value\x18\x02 \x01(\x0b\x32\x1b.inference.ModelQueuePolicy:\x02\x38\x01\"\xef\t\n\x15ModelSequenceBatching\x12\x41\n\x06\x64irect\x18\x03 \x01(\x0b\x32/.inference.ModelSequenceBatching.StrategyDirectH\x00\x12\x41\n\x06oldest\x18\x04 \x01(\x0b\x32/.inference.ModelSequenceBatching.StrategyOldestH\x00\x12&\n\x1emax_sequence_idle_microseconds\x18\x01 \x01(\x04\x12\x44\n\rcontrol_input\x18\x02 \x03(\x0b\x32-.inference.ModelSequenceBatching.ControlInput\x12\x35\n\x05state\x18\x05 \x03(\x0b\x32&.inference.ModelSequenceBatching.State\x1a\xb1\x02\n\x07\x43ontrol\x12;\n\x04kind\x18\x01 \x01(\x0e\x32-.inference.ModelSequenceBatching.Control.Kind\x12\x18\n\x10int32_false_true\x18\x02 \x03(\x05\x12\x17\n\x0f\x66p32_false_true\x18\x03 \x03(\x02\x12\x17\n\x0f\x62ool_false_true\x18\x05 \x03(\x08\x12&\n\tdata_type\x18\x04 \x01(\x0e\x32\x13.inference.DataType\"u\n\x04Kind\x12\x1a\n\x16\x43ONTROL_SEQUENCE_START\x10\x00\x12\x1a\n\x16\x43ONTROL_SEQUENCE_READY\x10\x01\x12\x18\n\x14\x43ONTROL_SEQUENCE_END\x10\x02\x12\x1b\n\x17\x43ONTROL_SEQUENCE_CORRID\x10\x03\x1aW\n\x0c\x43ontrolInput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x39\n\x07\x63ontrol\x18\x02 \x03(\x0b\x32(.inference.ModelSequenceBatching.Control\x1a\x8a\x01\n\x0cInitialState\x12&\n\tdata_type\x18\x01 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x12\x13\n\tzero_data\x18\x03 \x01(\x08H\x00\x12\x13\n\tdata_file\x18\x04 \x01(\tH\x00\x12\x0c\n\x04name\x18\x05 \x01(\tB\x0c\n\nstate_data\x1a\xac\x01\n\x05State\x12\x12\n\ninput_name\x18\x01 \x01(\t\x12\x13\n\x0boutput_name\x18\x02 \x01(\t\x12&\n\tdata_type\x18\x03 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x04 \x03(\x03\x12\x44\n\rinitial_state\x18\x05 \x03(\x0b\x32-.inference.ModelSequenceBatching.InitialState\x1aX\n\x0eStrategyDirect\x12$\n\x1cmax_queue_delay_microseconds\x18\x01 \x01(\x04\x12 \n\x18minimum_slot_utilization\x18\x02 \x01(\x02\x1au\n\x0eStrategyOldest\x12\x1f\n\x17max_candidate_sequences\x18\x01 \x01(\x05\x12\x1c\n\x14preferred_batch_size\x18\x02 \x03(\x05\x12$\n\x1cmax_queue_delay_microseconds\x18\x03 \x01(\x04\x42\x11\n\x0fstrategy_choice\"\xdd\x02\n\x0fModelEnsembling\x12-\n\x04step\x18\x01 \x03(\x0b\x32\x1f.inference.ModelEnsembling.Step\x1a\x9a\x02\n\x04Step\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\x03\x12@\n\tinput_map\x18\x03 \x03(\x0b\x32-.inference.ModelEnsembling.Step.InputMapEntry\x12\x42\n\noutput_map\x18\x04 \x03(\x0b\x32..inference.ModelEnsembling.Step.OutputMapEntry\x1a/\n\rInputMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0eOutputMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"&\n\x0eModelParameter\x12\x14\n\x0cstring_value\x18\x01 \x01(\t\"\xd9\x02\n\x0bModelWarmup\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x32\n\x06inputs\x18\x03 \x03(\x0b\x32\".inference.ModelWarmup.InputsEntry\x12\r\n\x05\x63ount\x18\x04 \x01(\r\x1a\x97\x01\n\x05Input\x12&\n\tdata_type\x18\x01 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x12\x13\n\tzero_data\x18\x03 \x01(\x08H\x00\x12\x15\n\x0brandom_data\x18\x04 \x01(\x08H\x00\x12\x19\n\x0finput_data_file\x18\x05 \x01(\tH\x00\x42\x11\n\x0finput_data_type\x1aK\n\x0bInputsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12+\n\x05value\x18\x02 
\x01(\x0b\x32\x1c.inference.ModelWarmup.Input:\x02\x38\x01\".\n\x0fModelOperations\x12\x1b\n\x13op_library_filename\x18\x01 \x03(\t\"+\n\x16ModelTransactionPolicy\x12\x11\n\tdecoupled\x18\x01 \x01(\x08\"\xe6\x01\n\x15ModelRepositoryAgents\x12\x36\n\x06\x61gents\x18\x01 \x03(\x0b\x32&.inference.ModelRepositoryAgents.Agent\x1a\x94\x01\n\x05\x41gent\x12\x0c\n\x04name\x18\x01 \x01(\t\x12J\n\nparameters\x18\x02 \x03(\x0b\x32\x36.inference.ModelRepositoryAgents.Agent.ParametersEntry\x1a\x31\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"$\n\x12ModelResponseCache\x12\x0e\n\x06\x65nable\x18\x01 \x01(\x08\"\xb2\n\n\x0bModelConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08platform\x18\x02 \x01(\t\x12\x0f\n\x07\x62\x61\x63kend\x18\x11 \x01(\t\x12\x35\n\x0eversion_policy\x18\x03 \x01(\x0b\x32\x1d.inference.ModelVersionPolicy\x12\x16\n\x0emax_batch_size\x18\x04 \x01(\x05\x12$\n\x05input\x18\x05 \x03(\x0b\x32\x15.inference.ModelInput\x12&\n\x06output\x18\x06 \x03(\x0b\x32\x16.inference.ModelOutput\x12*\n\x0b\x62\x61tch_input\x18\x14 \x03(\x0b\x32\x15.inference.BatchInput\x12,\n\x0c\x62\x61tch_output\x18\x15 \x03(\x0b\x32\x16.inference.BatchOutput\x12\x38\n\x0coptimization\x18\x0c \x01(\x0b\x32\".inference.ModelOptimizationPolicy\x12;\n\x10\x64ynamic_batching\x18\x0b \x01(\x0b\x32\x1f.inference.ModelDynamicBatchingH\x00\x12=\n\x11sequence_batching\x18\r \x01(\x0b\x32 .inference.ModelSequenceBatchingH\x00\x12\x39\n\x13\x65nsemble_scheduling\x18\x0f \x01(\x0b\x32\x1a.inference.ModelEnsemblingH\x00\x12\x35\n\x0einstance_group\x18\x07 \x03(\x0b\x32\x1d.inference.ModelInstanceGroup\x12\x1e\n\x16\x64\x65\x66\x61ult_model_filename\x18\x08 \x01(\t\x12H\n\x12\x63\x63_model_filenames\x18\t \x03(\x0b\x32,.inference.ModelConfig.CcModelFilenamesEntry\x12;\n\x0bmetric_tags\x18\n \x03(\x0b\x32&.inference.ModelConfig.MetricTagsEntry\x12:\n\nparameters\x18\x0e \x03(\x0b\x32&.inference.ModelConfig.ParametersEntry\x12,\n\x0cmodel_warmup\x18\x10 \x03(\x0b\x32\x16.inference.ModelWarmup\x12\x34\n\x10model_operations\x18\x12 \x01(\x0b\x32\x1a.inference.ModelOperations\x12\x43\n\x18model_transaction_policy\x18\x13 \x01(\x0b\x32!.inference.ModelTransactionPolicy\x12\x41\n\x17model_repository_agents\x18\x17 \x01(\x0b\x32 .inference.ModelRepositoryAgents\x12\x35\n\x0eresponse_cache\x18\x18 \x01(\x0b\x32\x1d.inference.ModelResponseCache\x1a\x37\n\x15\x43\x63ModelFilenamesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x31\n\x0fMetricTagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.inference.ModelParameter:\x02\x38\x01\x42\x13\n\x11scheduling_choice*\xfa\x01\n\x08\x44\x61taType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_UINT8\x10\x02\x12\x0f\n\x0bTYPE_UINT16\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_INT8\x10\x06\x12\x0e\n\nTYPE_INT16\x10\x07\x12\x0e\n\nTYPE_INT32\x10\x08\x12\x0e\n\nTYPE_INT64\x10\t\x12\r\n\tTYPE_FP16\x10\n\x12\r\n\tTYPE_FP32\x10\x0b\x12\r\n\tTYPE_FP64\x10\x0c\x12\x0f\n\x0bTYPE_STRING\x10\r\x12\r\n\tTYPE_BF16\x10\x0e\x62\x06proto3'
)
_DATATYPE = DESCRIPTOR.enum_types_by_name['DataType']
DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE)
TYPE_INVALID = 0
TYPE_BOOL = 1
TYPE_UINT8 = 2
TYPE_UINT16 = 3
TYPE_UINT32 = 4
TYPE_UINT64 = 5
TYPE_INT8 = 6
TYPE_INT16 = 7
TYPE_INT32 = 8
TYPE_INT64 = 9
TYPE_FP16 = 10
TYPE_FP32 = 11
TYPE_FP64 = 12
TYPE_STRING = 13
TYPE_BF16 = 14
_MODELRATELIMITER = DESCRIPTOR.message_types_by_name['ModelRateLimiter']
_MODELRATELIMITER_RESOURCE = _MODELRATELIMITER.nested_types_by_name['Resource']
_MODELINSTANCEGROUP = DESCRIPTOR.message_types_by_name['ModelInstanceGroup']
_MODELINSTANCEGROUP_SECONDARYDEVICE = _MODELINSTANCEGROUP.nested_types_by_name[
'SecondaryDevice']
_MODELTENSORRESHAPE = DESCRIPTOR.message_types_by_name['ModelTensorReshape']
_MODELINPUT = DESCRIPTOR.message_types_by_name['ModelInput']
_MODELOUTPUT = DESCRIPTOR.message_types_by_name['ModelOutput']
_BATCHINPUT = DESCRIPTOR.message_types_by_name['BatchInput']
_BATCHOUTPUT = DESCRIPTOR.message_types_by_name['BatchOutput']
_MODELVERSIONPOLICY = DESCRIPTOR.message_types_by_name['ModelVersionPolicy']
_MODELVERSIONPOLICY_LATEST = _MODELVERSIONPOLICY.nested_types_by_name['Latest']
_MODELVERSIONPOLICY_ALL = _MODELVERSIONPOLICY.nested_types_by_name['All']
_MODELVERSIONPOLICY_SPECIFIC = _MODELVERSIONPOLICY.nested_types_by_name[
'Specific']
_MODELOPTIMIZATIONPOLICY = DESCRIPTOR.message_types_by_name[
'ModelOptimizationPolicy']
_MODELOPTIMIZATIONPOLICY_GRAPH = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[
'Graph']
_MODELOPTIMIZATIONPOLICY_CUDA = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[
'Cuda']
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC = _MODELOPTIMIZATIONPOLICY_CUDA.nested_types_by_name[
'GraphSpec']
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[
'Shape']
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[
'LowerBound']
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND.nested_types_by_name[
'InputEntry']
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[
'InputEntry']
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[
'ExecutionAccelerators']
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR = _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS.nested_types_by_name[
'Accelerator']
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY = _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR.nested_types_by_name[
'ParametersEntry']
_MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[
'PinnedMemoryBuffer']
_MODELQUEUEPOLICY = DESCRIPTOR.message_types_by_name['ModelQueuePolicy']
_MODELDYNAMICBATCHING = DESCRIPTOR.message_types_by_name[
'ModelDynamicBatching']
_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY = _MODELDYNAMICBATCHING.nested_types_by_name[
'PriorityQueuePolicyEntry']
_MODELSEQUENCEBATCHING = DESCRIPTOR.message_types_by_name[
'ModelSequenceBatching']
_MODELSEQUENCEBATCHING_CONTROL = _MODELSEQUENCEBATCHING.nested_types_by_name[
'Control']
_MODELSEQUENCEBATCHING_CONTROLINPUT = _MODELSEQUENCEBATCHING.nested_types_by_name[
'ControlInput']
_MODELSEQUENCEBATCHING_INITIALSTATE = _MODELSEQUENCEBATCHING.nested_types_by_name[
'InitialState']
_MODELSEQUENCEBATCHING_STATE = _MODELSEQUENCEBATCHING.nested_types_by_name[
'State']
_MODELSEQUENCEBATCHING_STRATEGYDIRECT = _MODELSEQUENCEBATCHING.nested_types_by_name[
'StrategyDirect']
_MODELSEQUENCEBATCHING_STRATEGYOLDEST = _MODELSEQUENCEBATCHING.nested_types_by_name[
'StrategyOldest']
_MODELENSEMBLING = DESCRIPTOR.message_types_by_name['ModelEnsembling']
_MODELENSEMBLING_STEP = _MODELENSEMBLING.nested_types_by_name['Step']
_MODELENSEMBLING_STEP_INPUTMAPENTRY = _MODELENSEMBLING_STEP.nested_types_by_name[
'InputMapEntry']
_MODELENSEMBLING_STEP_OUTPUTMAPENTRY = _MODELENSEMBLING_STEP.nested_types_by_name[
'OutputMapEntry']
_MODELPARAMETER = DESCRIPTOR.message_types_by_name['ModelParameter']
_MODELWARMUP = DESCRIPTOR.message_types_by_name['ModelWarmup']
_MODELWARMUP_INPUT = _MODELWARMUP.nested_types_by_name['Input']
_MODELWARMUP_INPUTSENTRY = _MODELWARMUP.nested_types_by_name['InputsEntry']
_MODELOPERATIONS = DESCRIPTOR.message_types_by_name['ModelOperations']
_MODELTRANSACTIONPOLICY = DESCRIPTOR.message_types_by_name[
'ModelTransactionPolicy']
_MODELREPOSITORYAGENTS = DESCRIPTOR.message_types_by_name[
'ModelRepositoryAgents']
_MODELREPOSITORYAGENTS_AGENT = _MODELREPOSITORYAGENTS.nested_types_by_name[
'Agent']
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY = _MODELREPOSITORYAGENTS_AGENT.nested_types_by_name[
'ParametersEntry']
_MODELRESPONSECACHE = DESCRIPTOR.message_types_by_name['ModelResponseCache']
_MODELCONFIG = DESCRIPTOR.message_types_by_name['ModelConfig']
_MODELCONFIG_CCMODELFILENAMESENTRY = _MODELCONFIG.nested_types_by_name[
'CcModelFilenamesEntry']
_MODELCONFIG_METRICTAGSENTRY = _MODELCONFIG.nested_types_by_name[
'MetricTagsEntry']
_MODELCONFIG_PARAMETERSENTRY = _MODELCONFIG.nested_types_by_name[
'ParametersEntry']
_MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND = _MODELINSTANCEGROUP_SECONDARYDEVICE.enum_types_by_name[
'SecondaryDeviceKind']
_MODELINSTANCEGROUP_KIND = _MODELINSTANCEGROUP.enum_types_by_name['Kind']
_MODELINPUT_FORMAT = _MODELINPUT.enum_types_by_name['Format']
_BATCHINPUT_KIND = _BATCHINPUT.enum_types_by_name['Kind']
_BATCHOUTPUT_KIND = _BATCHOUTPUT.enum_types_by_name['Kind']
_MODELOPTIMIZATIONPOLICY_MODELPRIORITY = _MODELOPTIMIZATIONPOLICY.enum_types_by_name[
'ModelPriority']
_MODELQUEUEPOLICY_TIMEOUTACTION = _MODELQUEUEPOLICY.enum_types_by_name[
'TimeoutAction']
_MODELSEQUENCEBATCHING_CONTROL_KIND = _MODELSEQUENCEBATCHING_CONTROL.enum_types_by_name[
'Kind']
ModelRateLimiter = _reflection.GeneratedProtocolMessageType(
'ModelRateLimiter',
(_message.Message, ),
{
'Resource':
_reflection.GeneratedProtocolMessageType(
'Resource',
(_message.Message, ),
{
'DESCRIPTOR': _MODELRATELIMITER_RESOURCE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelRateLimiter.Resource)
}),
'DESCRIPTOR':
_MODELRATELIMITER,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelRateLimiter)
})
_sym_db.RegisterMessage(ModelRateLimiter)
_sym_db.RegisterMessage(ModelRateLimiter.Resource)
ModelInstanceGroup = _reflection.GeneratedProtocolMessageType(
'ModelInstanceGroup',
(_message.Message, ),
{
'SecondaryDevice':
_reflection.GeneratedProtocolMessageType(
'SecondaryDevice',
(_message.Message, ),
{
'DESCRIPTOR': _MODELINSTANCEGROUP_SECONDARYDEVICE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelInstanceGroup.SecondaryDevice)
}),
'DESCRIPTOR':
_MODELINSTANCEGROUP,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelInstanceGroup)
})
_sym_db.RegisterMessage(ModelInstanceGroup)
_sym_db.RegisterMessage(ModelInstanceGroup.SecondaryDevice)
ModelTensorReshape = _reflection.GeneratedProtocolMessageType(
'ModelTensorReshape',
(_message.Message, ),
{
'DESCRIPTOR': _MODELTENSORRESHAPE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelTensorReshape)
})
_sym_db.RegisterMessage(ModelTensorReshape)
ModelInput = _reflection.GeneratedProtocolMessageType(
'ModelInput',
(_message.Message, ),
{
'DESCRIPTOR': _MODELINPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelInput)
})
_sym_db.RegisterMessage(ModelInput)
ModelOutput = _reflection.GeneratedProtocolMessageType(
'ModelOutput',
(_message.Message, ),
{
'DESCRIPTOR': _MODELOUTPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOutput)
})
_sym_db.RegisterMessage(ModelOutput)
BatchInput = _reflection.GeneratedProtocolMessageType(
'BatchInput',
(_message.Message, ),
{
'DESCRIPTOR': _BATCHINPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.BatchInput)
})
_sym_db.RegisterMessage(BatchInput)
BatchOutput = _reflection.GeneratedProtocolMessageType(
'BatchOutput',
(_message.Message, ),
{
'DESCRIPTOR': _BATCHOUTPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.BatchOutput)
})
_sym_db.RegisterMessage(BatchOutput)
ModelVersionPolicy = _reflection.GeneratedProtocolMessageType(
'ModelVersionPolicy',
(_message.Message, ),
{
'Latest':
_reflection.GeneratedProtocolMessageType(
'Latest',
(_message.Message, ),
{
'DESCRIPTOR': _MODELVERSIONPOLICY_LATEST,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.Latest)
}),
'All':
_reflection.GeneratedProtocolMessageType(
'All',
(_message.Message, ),
{
'DESCRIPTOR': _MODELVERSIONPOLICY_ALL,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.All)
}),
'Specific':
_reflection.GeneratedProtocolMessageType(
'Specific',
(_message.Message, ),
{
'DESCRIPTOR': _MODELVERSIONPOLICY_SPECIFIC,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.Specific)
}),
'DESCRIPTOR':
_MODELVERSIONPOLICY,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy)
})
_sym_db.RegisterMessage(ModelVersionPolicy)
_sym_db.RegisterMessage(ModelVersionPolicy.Latest)
_sym_db.RegisterMessage(ModelVersionPolicy.All)
_sym_db.RegisterMessage(ModelVersionPolicy.Specific)
ModelOptimizationPolicy = _reflection.GeneratedProtocolMessageType(
'ModelOptimizationPolicy',
(_message.Message, ),
{
'Graph':
_reflection.GeneratedProtocolMessageType(
'Graph',
(_message.Message, ),
{
'DESCRIPTOR': _MODELOPTIMIZATIONPOLICY_GRAPH,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Graph)
}),
'Cuda':
_reflection.GeneratedProtocolMessageType(
'Cuda',
(_message.Message, ),
{
'GraphSpec':
_reflection.GeneratedProtocolMessageType(
'GraphSpec',
(_message.Message, ),
{
'Shape':
_reflection.GeneratedProtocolMessageType(
'Shape',
(_message.Message, ),
{
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape)
}),
'LowerBound':
_reflection.GeneratedProtocolMessageType(
'LowerBound',
(_message.Message, ),
{
'InputEntry':
_reflection.GeneratedProtocolMessageType(
'InputEntry',
(_message.Message, ),
{
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound)
}),
'InputEntry':
_reflection.GeneratedProtocolMessageType(
'InputEntry',
(_message.Message, ),
{
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_CUDA,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda)
}),
'ExecutionAccelerators':
_reflection.GeneratedProtocolMessageType(
'ExecutionAccelerators',
(_message.Message, ),
{
'Accelerator':
_reflection.GeneratedProtocolMessageType(
'Accelerator',
(_message.Message, ),
{
'ParametersEntry':
_reflection.GeneratedProtocolMessageType(
'ParametersEntry',
(_message.Message, ),
{
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators)
}),
'PinnedMemoryBuffer':
_reflection.GeneratedProtocolMessageType(
'PinnedMemoryBuffer',
(_message.Message, ),
{
'DESCRIPTOR': _MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.PinnedMemoryBuffer)
}),
'DESCRIPTOR':
_MODELOPTIMIZATIONPOLICY,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy)
})
_sym_db.RegisterMessage(ModelOptimizationPolicy)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Graph)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.Shape)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound)
_sym_db.RegisterMessage(
ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry)
_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry)
_sym_db.RegisterMessage(ModelOptimizationPolicy.ExecutionAccelerators)
_sym_db.RegisterMessage(
ModelOptimizationPolicy.ExecutionAccelerators.Accelerator)
_sym_db.RegisterMessage(
ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry)
_sym_db.RegisterMessage(ModelOptimizationPolicy.PinnedMemoryBuffer)
ModelQueuePolicy = _reflection.GeneratedProtocolMessageType(
'ModelQueuePolicy',
(_message.Message, ),
{
'DESCRIPTOR': _MODELQUEUEPOLICY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelQueuePolicy)
})
_sym_db.RegisterMessage(ModelQueuePolicy)
ModelDynamicBatching = _reflection.GeneratedProtocolMessageType(
'ModelDynamicBatching',
(_message.Message, ),
{
'PriorityQueuePolicyEntry':
_reflection.GeneratedProtocolMessageType(
'PriorityQueuePolicyEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelDynamicBatching.PriorityQueuePolicyEntry)
}),
'DESCRIPTOR':
_MODELDYNAMICBATCHING,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelDynamicBatching)
})
_sym_db.RegisterMessage(ModelDynamicBatching)
_sym_db.RegisterMessage(ModelDynamicBatching.PriorityQueuePolicyEntry)
ModelSequenceBatching = _reflection.GeneratedProtocolMessageType(
'ModelSequenceBatching',
(_message.Message, ),
{
'Control':
_reflection.GeneratedProtocolMessageType(
'Control',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_CONTROL,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.Control)
}),
'ControlInput':
_reflection.GeneratedProtocolMessageType(
'ControlInput',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_CONTROLINPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.ControlInput)
}),
'InitialState':
_reflection.GeneratedProtocolMessageType(
'InitialState',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_INITIALSTATE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.InitialState)
}),
'State':
_reflection.GeneratedProtocolMessageType(
'State',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_STATE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.State)
}),
'StrategyDirect':
_reflection.GeneratedProtocolMessageType(
'StrategyDirect',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_STRATEGYDIRECT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.StrategyDirect)
}),
'StrategyOldest':
_reflection.GeneratedProtocolMessageType(
'StrategyOldest',
(_message.Message, ),
{
'DESCRIPTOR': _MODELSEQUENCEBATCHING_STRATEGYOLDEST,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.StrategyOldest)
}),
'DESCRIPTOR':
_MODELSEQUENCEBATCHING,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching)
})
_sym_db.RegisterMessage(ModelSequenceBatching)
_sym_db.RegisterMessage(ModelSequenceBatching.Control)
_sym_db.RegisterMessage(ModelSequenceBatching.ControlInput)
_sym_db.RegisterMessage(ModelSequenceBatching.InitialState)
_sym_db.RegisterMessage(ModelSequenceBatching.State)
_sym_db.RegisterMessage(ModelSequenceBatching.StrategyDirect)
_sym_db.RegisterMessage(ModelSequenceBatching.StrategyOldest)
ModelEnsembling = _reflection.GeneratedProtocolMessageType(
'ModelEnsembling',
(_message.Message, ),
{
'Step':
_reflection.GeneratedProtocolMessageType(
'Step',
(_message.Message, ),
{
'InputMapEntry':
_reflection.GeneratedProtocolMessageType(
'InputMapEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELENSEMBLING_STEP_INPUTMAPENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step.InputMapEntry)
}),
'OutputMapEntry':
_reflection.GeneratedProtocolMessageType(
'OutputMapEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELENSEMBLING_STEP_OUTPUTMAPENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step.OutputMapEntry)
}),
'DESCRIPTOR':
_MODELENSEMBLING_STEP,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step)
}),
'DESCRIPTOR':
_MODELENSEMBLING,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelEnsembling)
})
_sym_db.RegisterMessage(ModelEnsembling)
_sym_db.RegisterMessage(ModelEnsembling.Step)
_sym_db.RegisterMessage(ModelEnsembling.Step.InputMapEntry)
_sym_db.RegisterMessage(ModelEnsembling.Step.OutputMapEntry)
ModelParameter = _reflection.GeneratedProtocolMessageType(
'ModelParameter',
(_message.Message, ),
{
'DESCRIPTOR': _MODELPARAMETER,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelParameter)
})
_sym_db.RegisterMessage(ModelParameter)
ModelWarmup = _reflection.GeneratedProtocolMessageType(
'ModelWarmup',
(_message.Message, ),
{
'Input':
_reflection.GeneratedProtocolMessageType(
'Input',
(_message.Message, ),
{
'DESCRIPTOR': _MODELWARMUP_INPUT,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelWarmup.Input)
}),
'InputsEntry':
_reflection.GeneratedProtocolMessageType(
'InputsEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELWARMUP_INPUTSENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelWarmup.InputsEntry)
}),
'DESCRIPTOR':
_MODELWARMUP,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelWarmup)
})
_sym_db.RegisterMessage(ModelWarmup)
_sym_db.RegisterMessage(ModelWarmup.Input)
_sym_db.RegisterMessage(ModelWarmup.InputsEntry)
ModelOperations = _reflection.GeneratedProtocolMessageType(
'ModelOperations',
(_message.Message, ),
{
'DESCRIPTOR': _MODELOPERATIONS,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelOperations)
})
_sym_db.RegisterMessage(ModelOperations)
ModelTransactionPolicy = _reflection.GeneratedProtocolMessageType(
'ModelTransactionPolicy',
(_message.Message, ),
{
'DESCRIPTOR': _MODELTRANSACTIONPOLICY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelTransactionPolicy)
})
_sym_db.RegisterMessage(ModelTransactionPolicy)
ModelRepositoryAgents = _reflection.GeneratedProtocolMessageType(
'ModelRepositoryAgents',
(_message.Message, ),
{
'Agent':
_reflection.GeneratedProtocolMessageType(
'Agent',
(_message.Message, ),
{
'ParametersEntry':
_reflection.GeneratedProtocolMessageType(
'ParametersEntry',
(_message.Message, ),
{
'DESCRIPTOR':
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents.Agent.ParametersEntry)
}),
'DESCRIPTOR':
_MODELREPOSITORYAGENTS_AGENT,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents.Agent)
}),
'DESCRIPTOR':
_MODELREPOSITORYAGENTS,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents)
})
_sym_db.RegisterMessage(ModelRepositoryAgents)
_sym_db.RegisterMessage(ModelRepositoryAgents.Agent)
_sym_db.RegisterMessage(ModelRepositoryAgents.Agent.ParametersEntry)
ModelResponseCache = _reflection.GeneratedProtocolMessageType(
'ModelResponseCache',
(_message.Message, ),
{
'DESCRIPTOR': _MODELRESPONSECACHE,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelResponseCache)
})
_sym_db.RegisterMessage(ModelResponseCache)
ModelConfig = _reflection.GeneratedProtocolMessageType(
'ModelConfig',
(_message.Message, ),
{
'CcModelFilenamesEntry':
_reflection.GeneratedProtocolMessageType(
'CcModelFilenamesEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELCONFIG_CCMODELFILENAMESENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelConfig.CcModelFilenamesEntry)
}),
'MetricTagsEntry':
_reflection.GeneratedProtocolMessageType(
'MetricTagsEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELCONFIG_METRICTAGSENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelConfig.MetricTagsEntry)
}),
'ParametersEntry':
_reflection.GeneratedProtocolMessageType(
'ParametersEntry',
(_message.Message, ),
{
'DESCRIPTOR': _MODELCONFIG_PARAMETERSENTRY,
'__module__': 'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelConfig.ParametersEntry)
}),
'DESCRIPTOR':
_MODELCONFIG,
'__module__':
'model_config.protxt_pb2'
# @@protoc_insertion_point(class_scope:inference.ModelConfig)
})
_sym_db.RegisterMessage(ModelConfig)
_sym_db.RegisterMessage(ModelConfig.CcModelFilenamesEntry)
_sym_db.RegisterMessage(ModelConfig.MetricTagsEntry)
_sym_db.RegisterMessage(ModelConfig.ParametersEntry)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._options = None
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_options = b'8\001'
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._options = None
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_options = b'8\001'
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._options = None
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_options = b'8\001'
_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._options = None
_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_options = b'8\001'
_MODELENSEMBLING_STEP_INPUTMAPENTRY._options = None
_MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_options = b'8\001'
_MODELENSEMBLING_STEP_OUTPUTMAPENTRY._options = None
_MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_options = b'8\001'
_MODELWARMUP_INPUTSENTRY._options = None
_MODELWARMUP_INPUTSENTRY._serialized_options = b'8\001'
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._options = None
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_options = b'8\001'
_MODELCONFIG_CCMODELFILENAMESENTRY._options = None
_MODELCONFIG_CCMODELFILENAMESENTRY._serialized_options = b'8\001'
_MODELCONFIG_METRICTAGSENTRY._options = None
_MODELCONFIG_METRICTAGSENTRY._serialized_options = b'8\001'
_MODELCONFIG_PARAMETERSENTRY._options = None
_MODELCONFIG_PARAMETERSENTRY._serialized_options = b'8\001'
_DATATYPE._serialized_start = 8137
_DATATYPE._serialized_end = 8387
_MODELRATELIMITER._serialized_start = 35
_MODELRATELIMITER._serialized_end = 185
_MODELRATELIMITER_RESOURCE._serialized_start = 130
_MODELRATELIMITER_RESOURCE._serialized_end = 185
_MODELINSTANCEGROUP._serialized_start = 188
_MODELINSTANCEGROUP._serialized_end = 707
_MODELINSTANCEGROUP_SECONDARYDEVICE._serialized_start = 484
_MODELINSTANCEGROUP_SECONDARYDEVICE._serialized_end = 640
_MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND._serialized_start = 603
_MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND._serialized_end = 640
_MODELINSTANCEGROUP_KIND._serialized_start = 642
_MODELINSTANCEGROUP_KIND._serialized_end = 707
_MODELTENSORRESHAPE._serialized_start = 709
_MODELTENSORRESHAPE._serialized_end = 744
_MODELINPUT._serialized_start = 747
_MODELINPUT._serialized_end = 1053
_MODELINPUT_FORMAT._serialized_start = 994
_MODELINPUT_FORMAT._serialized_end = 1053
_MODELOUTPUT._serialized_start = 1056
_MODELOUTPUT._serialized_end = 1234
_BATCHINPUT._serialized_start = 1237
_BATCHINPUT._serialized_end = 1582
_BATCHINPUT_KIND._serialized_start = 1377
_BATCHINPUT_KIND._serialized_end = 1582
_BATCHOUTPUT._serialized_start = 1585
_BATCHOUTPUT._serialized_end = 1728
_BATCHOUTPUT_KIND._serialized_start = 1686
_BATCHOUTPUT_KIND._serialized_end = 1728
_MODELVERSIONPOLICY._serialized_start = 1731
_MODELVERSIONPOLICY._serialized_end = 2003
_MODELVERSIONPOLICY_LATEST._serialized_start = 1919
_MODELVERSIONPOLICY_LATEST._serialized_end = 1949
_MODELVERSIONPOLICY_ALL._serialized_start = 1951
_MODELVERSIONPOLICY_ALL._serialized_end = 1956
_MODELVERSIONPOLICY_SPECIFIC._serialized_start = 1958
_MODELVERSIONPOLICY_SPECIFIC._serialized_end = 1986
_MODELOPTIMIZATIONPOLICY._serialized_start = 2006
_MODELOPTIMIZATIONPOLICY._serialized_end = 3795
_MODELOPTIMIZATIONPOLICY_GRAPH._serialized_start = 2536
_MODELOPTIMIZATIONPOLICY_GRAPH._serialized_end = 2558
_MODELOPTIMIZATIONPOLICY_CUDA._serialized_start = 2561
_MODELOPTIMIZATIONPOLICY_CUDA._serialized_end = 3259
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC._serialized_start = 2711
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC._serialized_end = 3259
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE._serialized_start = 2910
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE._serialized_end = 2930
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND._serialized_start = 2933
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND._serialized_end = 3156
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_start = 3055
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_end = 3156
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_start = 3055
_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_end = 3156
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS._serialized_start = 3262
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS._serialized_end = 3682
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR._serialized_start = 3498
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR._serialized_end = 3682
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_start = 3633
_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_end = 3682
_MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER._serialized_start = 3684
_MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER._serialized_end = 3720
_MODELOPTIMIZATIONPOLICY_MODELPRIORITY._serialized_start = 3722
_MODELOPTIMIZATIONPOLICY_MODELPRIORITY._serialized_end = 3795
_MODELQUEUEPOLICY._serialized_start = 3798
_MODELQUEUEPOLICY._serialized_end = 4017
_MODELQUEUEPOLICY_TIMEOUTACTION._serialized_start = 3979
_MODELQUEUEPOLICY_TIMEOUTACTION._serialized_end = 4017
_MODELDYNAMICBATCHING._serialized_start = 4020
_MODELDYNAMICBATCHING._serialized_end = 4431
_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_start = 4344
_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_end = 4431
_MODELSEQUENCEBATCHING._serialized_start = 4434
_MODELSEQUENCEBATCHING._serialized_end = 5697
_MODELSEQUENCEBATCHING_CONTROL._serialized_start = 4759
_MODELSEQUENCEBATCHING_CONTROL._serialized_end = 5064
_MODELSEQUENCEBATCHING_CONTROL_KIND._serialized_start = 4947
_MODELSEQUENCEBATCHING_CONTROL_KIND._serialized_end = 5064
_MODELSEQUENCEBATCHING_CONTROLINPUT._serialized_start = 5066
_MODELSEQUENCEBATCHING_CONTROLINPUT._serialized_end = 5153
_MODELSEQUENCEBATCHING_INITIALSTATE._serialized_start = 5156
_MODELSEQUENCEBATCHING_INITIALSTATE._serialized_end = 5294
_MODELSEQUENCEBATCHING_STATE._serialized_start = 5297
_MODELSEQUENCEBATCHING_STATE._serialized_end = 5469
_MODELSEQUENCEBATCHING_STRATEGYDIRECT._serialized_start = 5471
_MODELSEQUENCEBATCHING_STRATEGYDIRECT._serialized_end = 5559
_MODELSEQUENCEBATCHING_STRATEGYOLDEST._serialized_start = 5561
_MODELSEQUENCEBATCHING_STRATEGYOLDEST._serialized_end = 5678
_MODELENSEMBLING._serialized_start = 5700
_MODELENSEMBLING._serialized_end = 6049
_MODELENSEMBLING_STEP._serialized_start = 5767
_MODELENSEMBLING_STEP._serialized_end = 6049
_MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_start = 5952
_MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_end = 5999
_MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_start = 6001
_MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_end = 6049
_MODELPARAMETER._serialized_start = 6051
_MODELPARAMETER._serialized_end = 6089
_MODELWARMUP._serialized_start = 6092
_MODELWARMUP._serialized_end = 6437
_MODELWARMUP_INPUT._serialized_start = 6209
_MODELWARMUP_INPUT._serialized_end = 6360
_MODELWARMUP_INPUTSENTRY._serialized_start = 6362
_MODELWARMUP_INPUTSENTRY._serialized_end = 6437
_MODELOPERATIONS._serialized_start = 6439
_MODELOPERATIONS._serialized_end = 6485
_MODELTRANSACTIONPOLICY._serialized_start = 6487
_MODELTRANSACTIONPOLICY._serialized_end = 6530
_MODELREPOSITORYAGENTS._serialized_start = 6533
_MODELREPOSITORYAGENTS._serialized_end = 6763
_MODELREPOSITORYAGENTS_AGENT._serialized_start = 6615
_MODELREPOSITORYAGENTS_AGENT._serialized_end = 6763
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_start = 3633
_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_end = 3682
_MODELRESPONSECACHE._serialized_start = 6765
_MODELRESPONSECACHE._serialized_end = 6801
_MODELCONFIG._serialized_start = 6804
_MODELCONFIG._serialized_end = 8134
_MODELCONFIG_CCMODELFILENAMESENTRY._serialized_start = 7929
_MODELCONFIG_CCMODELFILENAMESENTRY._serialized_end = 7984
_MODELCONFIG_METRICTAGSENTRY._serialized_start = 7986
_MODELCONFIG_METRICTAGSENTRY._serialized_end = 8035
_MODELCONFIG_PARAMETERSENTRY._serialized_start = 8037
_MODELCONFIG_PARAMETERSENTRY._serialized_end = 8113
# @@protoc_insertion_point(module_scope)
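Note: the _serialized_start/_serialized_end offsets above are emitted by protoc for a Triton-style model_config.proto and are only consumed by the protobuf runtime; regenerate the file rather than editing these values by hand. A minimal sketch of how such a generated module is typically used — the import path below is an assumption, not something this diff shows:

# Hedged sketch: parsing a Triton-style config.pbtxt with the generated module.
# The import path is assumed; adjust it to wherever model_config_pb2.py lives
# in your checkout.
from google.protobuf import text_format

from visualdl.component.inference.proto import model_config_pb2  # assumed path

config = model_config_pb2.ModelConfig()
with open('config.pbtxt') as f:
    text_format.Parse(f.read(), config)
print(config.name, config.max_batch_size)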
@@ -417,7 +417,10 @@ def get_component_tabs(*apis, vdl_args, request_args):
             all_tabs.update(api('component_tabs', request_args))
         all_tabs.add('static_graph')
     else:
-        return ['static_graph', 'x2paddle', 'fastdeploy_server']
+        return [
+            'static_graph', 'x2paddle', 'fastdeploy_server',
+            'fastdeploy_client'
+        ]
     return list(all_tabs)
...
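With 'fastdeploy_client' added to the fallback list, the client tab is advertised to the frontend even when VisualDL is started without a logdir. A hedged sketch of checking which tabs a running instance reports — the '/api' prefix and port 8040 reflect a typical local setup, not anything defined in this hunk:

# Hedged sketch: asking a running VisualDL server which component tabs are enabled.
import requests

resp = requests.get('http://localhost:8040/api/component_tabs')
print(resp.json())  # expected to mention 'fastdeploy_server' and 'fastdeploy_client'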
@@ -13,12 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =======================================================================
+import json
 import multiprocessing
 import os
 import re
 import sys
 import threading
 import time
+import urllib
 import webbrowser

 import requests
@@ -32,6 +34,8 @@ from flask_babel import Babel
 import visualdl.server
 from visualdl import __version__
+from visualdl.component.inference.fastdeploy_lib import get_start_arguments
+from visualdl.component.inference.fastdeploy_server import create_fastdeploy_api_call
 from visualdl.component.inference.model_convert_server import create_model_convert_api_call
 from visualdl.component.profiler.profiler_server import create_profiler_api_call
 from visualdl.server.api import create_api_call
@@ -71,6 +75,7 @@ def create_app(args): # noqa: C901
     api_call = create_api_call(args.logdir, args.model, args.cache_timeout)
     profiler_api_call = create_profiler_api_call(args.logdir)
     inference_api_call = create_model_convert_api_call()
+    fastdeploy_api_call = create_fastdeploy_api_call()
     if args.telemetry:
         update_util.PbUpdater(args.product).start()
@@ -153,6 +158,141 @@ def create_app(args): # noqa: C901
         return make_response(
             Response(data, mimetype=mimetype, headers=headers))
+
+    @app.route(api_path + '/fastdeploy/<path:method>', methods=["GET", "POST"])
+    def serve_fastdeploy_api(method):
+        if request.method == 'POST':
+            data, mimetype, headers = fastdeploy_api_call(method, request.form)
+        else:
+            data, mimetype, headers = fastdeploy_api_call(method, request.args)
+        return make_response(
+            Response(data, mimetype=mimetype, headers=headers))
+
+    @app.route(
+        api_path + '/fastdeploy/fastdeploy_client', methods=["GET", "POST"])
+    def serve_fastdeploy_create_fastdeploy_client():
+        try:
+            if request.method == 'POST':
+                fastdeploy_api_call('create_fastdeploy_client', request.form)
+                request_args = request.form
+            else:
+                fastdeploy_api_call('create_fastdeploy_client', request.args)
+                request_args = request.args
+        except Exception as e:
+            error_msg = '{}'.format(e)
+            return make_response(error_msg)
+        args = urllib.parse.urlencode(request_args)
+        if args:
+            return redirect(
+                api_path + "/fastdeploy/fastdeploy_client/app?{}".format(args),
+                code=302)
+        return redirect(
+            api_path + "/fastdeploy/fastdeploy_client/app", code=302)
+
+    @app.route(
+        api_path + "/fastdeploy/fastdeploy_client/<path:path>",
+        methods=["GET", "POST"])
+    def request_fastdeploy_create_fastdeploy_client_app(path: str):
+        '''
+        Reverse proxy for the gradio client app: every url under this route
+        is forwarded to the local gradio server that backs the fastdeploy client.
+        Args:
+            path(str): Resource path requested from the gradio server.
+        Returns:
+            Whatever the gradio server returns for that resource.
+        '''
+        if request.method == 'POST':
+            port = fastdeploy_api_call('create_fastdeploy_client',
+                                       request.form)
+            request_args = request.form
+        else:
+            port = fastdeploy_api_call('create_fastdeploy_client',
+                                       request.args)
+            request_args = request.args
+        if path == 'app':
+            proxy_url = request.url.replace(
+                request.host_url.rstrip('/') + api_path +
+                '/fastdeploy/fastdeploy_client/app',
+                'http://localhost:{}/'.format(port))
+        else:
+            proxy_url = request.url.replace(
+                request.host_url.rstrip('/') + api_path +
+                '/fastdeploy/fastdeploy_client/',
+                'http://localhost:{}/'.format(port))
+        # forward the original request to the local gradio server
+        resp = requests.request(
+            method=request.method,
+            url=proxy_url,
+            headers={
+                key: value
+                for (key, value) in request.headers if key != 'Host'
+            },
+            data=request.get_data(),
+            cookies=request.cookies,
+            allow_redirects=False)
+        if path == 'app':
+            content = resp.content
+            if request_args and 'server_id' in request_args:
+                # pre-fill the gradio form defaults (server address, ports,
+                # model name and version) from the launch arguments of the
+                # selected fastdeploy server
+                server_id = request_args.get('server_id')
+                start_args = get_start_arguments(server_id)
+                http_port = start_args.get('http-port', '')
+                metrics_port = start_args.get('metrics-port', '')
+                model_name = start_args.get('default_model_name', '')
+                content = content.decode()
+                try:
+                    default_server_addr = re.search(
+                        '"label": {}.*?"value": "".*?}}'.format(
+                            json.dumps("服务ip", ensure_ascii=True).replace(
+                                '\\', '\\\\')), content).group(0)
+                    cur_server_addr = default_server_addr.replace(
+                        '"value": ""', '"value": "localhost"')
+                    default_http_port = re.search(
+                        '"label": {}.*?"value": "".*?}}'.format(
+                            json.dumps("推理服务端口", ensure_ascii=True).replace(
+                                '\\', '\\\\')), content).group(0)
+                    cur_http_port = default_http_port.replace(
+                        '"value": ""', '"value": "{}"'.format(http_port))
+                    default_metrics_port = re.search(
+                        '"label": {}.*?"value": "".*?}}'.format(
+                            json.dumps("性能服务端口", ensure_ascii=True).replace(
+                                '\\', '\\\\')), content).group(0)
+                    cur_metrics_port = default_metrics_port.replace(
+                        '"value": ""', '"value": "{}"'.format(metrics_port))
+                    default_model_name = re.search(
+                        '"label": {}.*?"value": "".*?}}'.format(
+                            json.dumps("模型名称", ensure_ascii=True).replace(
+                                '\\', '\\\\')), content).group(0)
+                    cur_model_name = default_model_name.replace(
+                        '"value": ""', '"value": "{}"'.format(model_name))
+                    default_model_version = re.search(
+                        '"label": {}.*?"value": "".*?}}'.format(
+                            json.dumps("模型版本", ensure_ascii=True).replace(
+                                '\\', '\\\\')), content).group(0)
+                    cur_model_version = default_model_version.replace(
+                        '"value": ""', '"value": "{}"'.format('1'))
+                    content = content.replace(default_server_addr,
+                                              cur_server_addr)
+                    if http_port:
+                        content = content.replace(default_http_port,
+                                                  cur_http_port)
+                    if metrics_port:
+                        content = content.replace(default_metrics_port,
+                                                  cur_metrics_port)
+                    if model_name:
+                        content = content.replace(default_model_name,
+                                                  cur_model_name)
+                    content = content.replace(default_model_version,
+                                              cur_model_version)
+                except Exception:
+                    pass
+                finally:
+                    content = content.encode()
+        else:
+            content = resp.content
+        headers = [(name, value) for (name, value) in resp.raw.headers.items()]
+        response = Response(content, resp.status_code, headers)
+        return response
+
     @app.route(api_path + '/component_tabs')
     def component_tabs():
         data, mimetype, headers = get_component_tabs(
...
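The three routes added above do the following: /fastdeploy/<path:method> dispatches API calls to the handler returned by create_fastdeploy_api_call; /fastdeploy/fastdeploy_client starts (or reuses) a local gradio client process and redirects to its app page; /fastdeploy/fastdeploy_client/<path:path> reverse-proxies every gradio resource, rewriting the default server address, ports, model name, and model version in the gradio config whenever a server_id query parameter is supplied. A hedged sketch of calling the dispatcher from Python — 'get_server_list' is an assumed method name, since the real set of methods lives in fastdeploy_server.py, which is not part of this hunk:

# Hedged sketch: exercising the new dispatcher route of a locally running VisualDL.
import requests

base = 'http://localhost:8040/api/fastdeploy'
print(requests.get(base + '/get_server_list').json())  # assumed method name

# The gradio client itself is reached through the proxy, e.g.
# http://localhost:8040/api/fastdeploy/fastdeploy_client?server_id=<id>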
@@ -78,7 +78,8 @@ def validate_args(args):
     supported_tabs = [
         'scalar', 'image', 'text', 'embeddings', 'audio', 'histogram',
         'hyper_parameters', 'static_graph', 'dynamic_graph', 'pr_curve',
-        'roc_curve', 'profiler', 'x2paddle', 'fastdeploy_server'
+        'roc_curve', 'profiler', 'x2paddle', 'fastdeploy_server',
+        'fastdeploy_client'
     ]
     if args.component_tabs is not None:
         for component_tab in args.component_tabs:
...
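Because validate_args rejects any value outside supported_tabs, the client tab has to be whitelisted here before it can be requested on the command line. A hedged sketch of starting the server with only the two FastDeploy tabs — the flag spelling is assumed from the args.component_tabs attribute, so check visualdl --help for your version:

# Hedged sketch: launching VisualDL with only the FastDeploy tabs enabled.
import subprocess

subprocess.run([
    'visualdl', '--logdir', './log',
    '--component_tabs', 'fastdeploy_server', 'fastdeploy_client'
], check=True)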
@@ -23,6 +23,7 @@ USER_HOME = os.path.expanduser('~')
 VDL_HOME = os.path.join(USER_HOME, '.visualdl')
 CONF_HOME = os.path.join(VDL_HOME, 'conf')
 CONFIG_PATH = os.path.join(CONF_HOME, 'config.json')
+FASTDEPLOYSERVER_PATH = os.path.join(VDL_HOME, 'fastdeployserver')
 X2PADDLE_CACHE_PATH = os.path.join(VDL_HOME, 'x2paddle')
@@ -32,5 +33,7 @@ def init_vdl_config():
     if not os.path.exists(CONFIG_PATH) or 0 == os.path.getsize(CONFIG_PATH):
         with open(CONFIG_PATH, 'w') as fp:
             fp.write(json.dumps(default_vdl_config))
+    if not os.path.exists(FASTDEPLOYSERVER_PATH):
+        os.makedirs(FASTDEPLOYSERVER_PATH, exist_ok=True)
     if not os.path.exists(X2PADDLE_CACHE_PATH):
         os.makedirs(X2PADDLE_CACHE_PATH, exist_ok=True)
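init_vdl_config now also creates ~/.visualdl/fastdeployserver, the working directory used by the new component (presumably for per-server configuration and resource files), mirroring how the x2paddle cache directory is handled. A hedged sketch of verifying it — the module path visualdl.utils.dir is an assumption based on the constants shown:

# Hedged sketch: checking the new working directory after config initialization.
import os

from visualdl.utils.dir import FASTDEPLOYSERVER_PATH, init_vdl_config  # assumed path

init_vdl_config()
print(FASTDEPLOYSERVER_PATH)                 # e.g. /home/<user>/.visualdl/fastdeployserver
print(os.path.isdir(FASTDEPLOYSERVER_PATH))  # True once init has run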