diff --git a/doc/Serving_Configure_CN.md b/doc/Serving_Configure_CN.md index 0c145abcc356d20d2c6c45c5914430de6ed01f69..5b42221a894de54c4c46e23c254f62d464c9bc4f 100644 --- a/doc/Serving_Configure_CN.md +++ b/doc/Serving_Configure_CN.md @@ -364,11 +364,41 @@ dag: tracer: interval_s: 10 + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 + #client_type: local_predictor + + #channel的最大长度,默认为0 + #channel_size: 0 + + #针对大模型分布式场景tensor并行,接收第一个返回结果后其他结果丢弃来提高速度 + #channel_recv_frist_arrive: False + op: det: #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 6 + #Serving IPs + #server_endpoints: ["127.0.0.1:9393"] + + #Fetch结果列表,以client_config中fetch_var的alias_name为准 + #fetch_list: ["concat_1.tmp_0"] + + #det模型client端配置 + #client_config: serving_client_conf.prototxt + + #Serving交互超时时间, 单位ms + #timeout: 3000 + + #Serving交互重试次数,默认不重试 + #retry: 1 + + # 批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout,否则不足batch_size时会阻塞 + #batch_size: 2 + + # 批量查询超时,与batch_size配合使用 + #auto_batching_timeout: 2000 + #当op配置没有server_endpoints时,从local_service_conf读取本地服务配置 local_service_conf: #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 @@ -399,6 +429,27 @@ op: #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" precision: "fp32" + + #mem_optim, memory / graphic memory optimization + #mem_optim: True + + #use_calib, Use TRT int8 calibration + #use_calib: False + + #use_mkldnn, Use mkldnn for cpu + #use_mkldnn: False + + #The cache capacity of different input shapes for mkldnn + #mkldnn_cache_capacity: 0 + + #mkldnn_op_list, op list accelerated using MKLDNN, None default + #mkldnn_op_list: [] + + #mkldnn_bf16_op_list, op list accelerated using MKLDNN bf16, None default.
+ #mkldnn_bf16_op_list: [] + + #min_subgraph_size, the minimal subgraph size for opening tensorrt to optimize, 3 default + #min_subgraph_size: 3 rec: #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 3 diff --git a/doc/Serving_Configure_EN.md b/doc/Serving_Configure_EN.md index d586db6d0314f5af7d0fc2b217b389552a5034a2..1a3e4bd180c7d2e5c39aca550960c6fb8c551e6e 100644 --- a/doc/Serving_Configure_EN.md +++ b/doc/Serving_Configure_EN.md @@ -369,11 +369,41 @@ dag: tracer: interval_s: 10 + #client type, include brpc, grpc and local_predictor. + #client_type: local_predictor + + # max channel size, default 0 + #channel_size: 0 + + #For distributed large model scenario with tensor parallelism, the first result is received and the other results are discarded to improve speed + #channel_recv_frist_arrive: False + op: det: #concurrency,is_thread_op=True,thread otherwise process concurrency: 6 + #Serving IPs + #server_endpoints: ["127.0.0.1:9393"] + + #Fetch data list + #fetch_list: ["concat_1.tmp_0"] + + #det client config + #client_config: serving_client_conf.prototxt + + #Serving timeout, ms + #timeout: 3000 + + #Serving retry times + #retry: 1 + + #Default 1. batch_size>1 should set auto_batching_timeout + #batch_size: 2 + + #Batching timeout, used with batch_size + #auto_batching_timeout: 2000 + #Loading local server configuration without server_endpoints. local_service_conf: #client type,include brpc, grpc and local_predictor. 
@@ -404,6 +434,27 @@ op: #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" precision: "fp32" + + #mem_optim, memory / graphic memory optimization + #mem_optim: True + + #use_calib, Use TRT int8 calibration + #use_calib: False + + #use_mkldnn, Use mkldnn for cpu + #use_mkldnn: False + + #The cache capacity of different input shapes for mkldnn + #mkldnn_cache_capacity: 0 + + #mkldnn_op_list, op list accelerated using MKLDNN, None default + #mkldnn_op_list: [] + + #mkldnn_bf16_op_list, op list accelerated using MKLDNN bf16, None default. + #mkldnn_bf16_op_list: [] + + #min_subgraph_size, the minimal subgraph size for opening tensorrt to optimize, 3 default + #min_subgraph_size: 3 rec: #concurrency,is_thread_op=True,thread otherwise process concurrency: 3