diff --git a/python/examples/pipeline/imagenet/config.yml b/python/examples/pipeline/imagenet/config.yml
index 52ddab6f3194efe7c884411bfbcd381f76ea075e..6e48018f2867c51d19e646521aeccf3394537f79 100644
--- a/python/examples/pipeline/imagenet/config.yml
+++ b/python/examples/pipeline/imagenet/config.yml
@@ -20,6 +20,9 @@ op:
             #uci模型路径
             model_config: ResNet50_vd_model
 
+            #计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 1
+
             #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
             devices: "0" # "0,1"
diff --git a/python/examples/pipeline/simple_web_service/config.yml b/python/examples/pipeline/simple_web_service/config.yml
index f08f95b953c3b4871f33288d75ed8d31b02bc2c3..52e674099a7ba4647b4587da7da8f7f59e10e0d5 100644
--- a/python/examples/pipeline/simple_web_service/config.yml
+++ b/python/examples/pipeline/simple_web_service/config.yml
@@ -20,7 +20,10 @@ op:
             #uci模型路径
             model_config: uci_housing_model
 
-            #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
+            #计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 0
+
+            #计算硬件ID,优先由device_type决定硬件类型。devices为""或空缺时为CPU预测;当为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
             devices: "" # "0,1"
 
             #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py
index 2a2fcabea89f2e44fad963faace696d7d0af5c93..5a641fe6358a62b67c435e9881d481c2c5616b1f 100644
--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -20,6 +20,7 @@ import google.protobuf.text_format
 import numpy as np
 import argparse
 import paddle.fluid as fluid
+import paddle.inference as inference
 from .proto import general_model_config_pb2 as m_config
 from paddle.fluid.core import PaddleTensor
 from paddle.fluid.core import AnalysisConfig
@@ -125,14 +126,13 @@ class LocalPredictor(object):
 
         if use_lite:
             config.enable_lite_engine(
-                precision_mode = PrecisionType.Float32,
-                zero_copy = True,
-                passes_filter = [],
-                ops_filter = []
-            )
+                precision_mode=inference.PrecisionType.Float32,
+                zero_copy=True,
+                passes_filter=[],
+                ops_filter=[])
 
         if use_xpu:
-            config.enable_xpu(100 * 1024 * 1024)
+            config.enable_xpu(8 * 1024 * 1024)
 
         self.predictor = create_paddle_predictor(config)
diff --git a/python/pipeline/local_service_handler.py b/python/pipeline/local_service_handler.py
index f519ca2d115128bc6a6e5778dba992bc82bda5c1..eaa04ee01411260f82992d4327c9d8ac033b91f0 100644
--- a/python/pipeline/local_service_handler.py
+++ b/python/pipeline/local_service_handler.py
@@ -38,14 +38,12 @@ class LocalServiceHandler(object):
                  client_type='local_predictor',
                  workdir="",
                  thread_num=2,
+                 device_type=-1,
                  devices="",
                  fetch_names=None,
                  mem_optim=True,
                  ir_optim=False,
                  available_port_generator=None,
-                 use_trt=False,
-                 use_lite=False,
-                 use_xpu=False,
                  use_profile=False):
         """
         Initialization of localservicehandler
@@ -55,15 +53,14 @@ class LocalServiceHandler(object):
            client_type: brpc, grpc and local_predictor[default]
            workdir: work directory
            thread_num: number of threads, concurrent quantity.
+           device_type: support multiple devices. -1=Not set, determined by
+               `devices`. 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
            devices: gpu id list[gpu], "" default[cpu]
            fetch_names: get fetch names out of LocalServiceHandler in
                local_predictor mode. fetch_names_ is compatible for Client().
            mem_optim: use memory/graphics memory optimization, True default.
            ir_optim: use calculation chart optimization, False default.
            available_port_generator: generate available ports
-           use_trt: use nvidia tensorRt engine, False default.
-           use_lite: use Paddle-Lite engine, False default.
-           use_xpu: run predict on Baidu Kunlun, False default.
            use_profile: use profiling, False default.
 
         Returns:
@@ -74,28 +71,61 @@ class LocalServiceHandler(object):
         self._model_config = model_config
         self._port_list = []
-        self._device_type = "cpu"
-        if devices == "":
-            # cpu
-            devices = [-1]
-            if use_lite:
-                self._device_type = "arm"
-                self._port_list.append(available_port_generator.next())
-                _LOGGER.info("Model({}) will be launch in arm device. Port({})"
-                             .format(model_config, self._port_list))
+        self._device_name = "cpu"
+        self._use_gpu = False
+        self._use_trt = False
+        self._use_lite = False
+        self._use_xpu = False
+
+        if device_type == -1:
+            # device_type is not set, determined by `devices`,
+            if devices == "":
+                # CPU
+                self._device_name = "cpu"
+                devices = [-1]
             else:
-                self._device_type = "cpu"
-                self._port_list.append(available_port_generator.next())
-                _LOGGER.info("Model({}) will be launch in cpu device. Port({})"
-                             .format(model_config, self._port_list))
-        else:
-            # gpu
-            self._device_type = "gpu"
+                # GPU
+                self._device_name = "gpu"
+                self._use_gpu = True
+                devices = [int(x) for x in devices.split(",")]
+
+        elif device_type == 0:
+            # CPU
+            self._device_name = "cpu"
+            devices = [-1]
+        elif device_type == 1:
+            # GPU
+            self._device_name = "gpu"
+            self._use_gpu = True
+            devices = [int(x) for x in devices.split(",")]
+        elif device_type == 2:
+            # Nvidia Tensor RT
+            self._device_name = "gpu"
+            self._use_gpu = True
             devices = [int(x) for x in devices.split(",")]
+            self._use_trt = True
+        elif device_type == 3:
+            # ARM CPU
+            self._device_name = "arm"
+            devices = [-1]
+            self._use_lite = True
+        elif device_type == 4:
+            # Kunlun XPU
+            self._device_name = "arm"
+            devices = [int(x) for x in devices.split(",")]
+            self._use_lite = True
+            self._use_xpu = True
+        else:
+            _LOGGER.error(
+                "LocalServiceHandler initialization fail. device_type={}"
+                .format(device_type))
+
+        if client_type == "brpc" or client_type == "grpc":
             for _ in devices:
                 self._port_list.append(available_port_generator.next())
-            _LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})"
-                         .format(model_config, devices, self._port_list))
+            _LOGGER.info("Create ports for devices:{}. Port:{}"
+                         .format(devices, self._port_list))
+
+        self._client_type = client_type
         self._workdir = workdir
         self._devices = devices
@@ -105,14 +135,21 @@ class LocalServiceHandler(object):
         self._local_predictor_client = None
         self._rpc_service_list = []
         self._server_pros = []
-        self._use_trt = use_trt
-        self._use_lite = use_lite
-        self._use_xpu = use_xpu
         self._use_profile = use_profile
-        self.fetch_names_ = fetch_names
+        self._fetch_names = fetch_names
+
+        _LOGGER.info(
+            "Models({}) will be launched by device {}. use_gpu:{}, "
+            "use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
+            "mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
+            "client_type:{}, fetch_names:{}".format(
+                model_config, self._device_name, self._use_gpu, self._use_trt,
+                self._use_lite, self._use_xpu, device_type, self._devices,
+                self._mem_optim, self._ir_optim, self._use_profile,
+                self._thread_num, self._client_type, self._fetch_names))
 
     def get_fetch_list(self):
-        return self.fetch_names_
+        return self._fetch_names
 
     def get_port_list(self):
         return self._port_list
@@ -149,22 +186,17 @@ class LocalServiceHandler(object):
         from paddle_serving_app.local_predict import LocalPredictor
         if self._local_predictor_client is None:
             self._local_predictor_client = LocalPredictor()
-            use_gpu = False
-            use_lite = False
-            if self._device_type == "gpu":
-                use_gpu = True
-            elif self._device_type == "arm":
-                use_lite = True
+
             self._local_predictor_client.load_model_config(
                 model_path=self._model_config,
-                use_gpu=use_gpu,
+                use_gpu=self._use_gpu,
                 gpu_id=self._devices[concurrency_idx],
                 use_profile=self._use_profile,
                 thread_num=self._thread_num,
                 mem_optim=self._mem_optim,
                 ir_optim=self._ir_optim,
                 use_trt=self._use_trt,
-                use_lite=use_lite,
+                use_lite=self._use_lite,
                 use_xpu=self._use_xpu)
         return self._local_predictor_client
@@ -174,7 +206,7 @@ class LocalServiceHandler(object):
     def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
                             ir_optim):
         """
-        According to _device_type, generating one CpuServer or GpuServer, and
+        According to self._device_name, generating one Cpu/Gpu/Arm Server, and
         setting the model config amd startup params.
 
         Args:
@@ -188,7 +220,7 @@ class LocalServiceHandler(object):
         Returns:
             server: CpuServer/GpuServer
         """
-        if self._device_type == "cpu":
+        if self._device_name == "cpu":
             from paddle_serving_server import OpMaker, OpSeqMaker, Server
             op_maker = OpMaker()
             read_op = op_maker.create('general_reader')
@@ -225,9 +257,9 @@ class LocalServiceHandler(object):
         server.load_model_config(self._model_config)
         server.prepare_server(
-            workdir=workdir, port=port, device=self._device_type)
-        if self.fetch_names_ is None:
-            self.fetch_names_ = server.get_fetch_list()
+            workdir=workdir, port=port, device=self._device_name)
+        if self._fetch_names is None:
+            self._fetch_names = server.get_fetch_list()
         return server
 
     def _start_one_server(self, service_idx):
@@ -264,7 +296,7 @@ class LocalServiceHandler(object):
         """
         Start multiple processes and start one server in each process
         """
-        for i, service in enumerate(self._rpc_service_list):
+        for i, _ in enumerate(self._rpc_service_list):
             p = multiprocessing.Process(
                 target=self._start_one_server, args=(i, ))
             p.daemon = True
diff --git a/python/pipeline/operator.py b/python/pipeline/operator.py
index 4f488f6538f9faa2ae705378d5a0ae99538a6e5d..dda992c7d8adc6b73cb0d156c4a30a0badcc41b1 100644
--- a/python/pipeline/operator.py
+++ b/python/pipeline/operator.py
@@ -134,6 +134,7 @@ class Op(object):
         self.model_config = None
         self.workdir = None
         self.thread_num = self.concurrency
+        self.device_type = -1
         self.devices = ""
         self.mem_optim = False
         self.ir_optim = False
@@ -153,6 +154,7 @@ class Op(object):
             self.client_type = local_service_conf.get("client_type")
             self.workdir = local_service_conf.get("workdir")
             self.thread_num = local_service_conf.get("thread_num")
+            self.device_type = local_service_conf.get("device_type")
             self.devices = local_service_conf.get("devices")
             self.mem_optim = local_service_conf.get("mem_optim")
             self.ir_optim = local_service_conf.get("ir_optim")
@@ -168,6 +170,7 @@ class Op(object):
                     client_type=self.client_type,
                     workdir=self.workdir,
                     thread_num=self.thread_num,
+                    device_type=self.device_type,
                     devices=self.devices,
                     mem_optim=self.mem_optim,
                     ir_optim=self.ir_optim)
@@ -188,8 +191,11 @@ class Op(object):
                     client_type=self.client_type,
                     workdir=self.workdir,
                     thread_num=self.thread_num,
+                    device_type=self.device_type,
                     devices=self.devices,
-                    fetch_names=self._fetch_names)
+                    fetch_names=self._fetch_names,
+                    mem_optim=self.mem_optim,
+                    ir_optim=self.ir_optim)
                 if self._client_config is None:
                     self._client_config = service_handler.get_client_config(
                     )
@@ -550,7 +556,8 @@ class Op(object):
                 args=(concurrency_idx, self._get_input_channel(),
                       self._get_output_channels(), False, trace_buffer,
                       self.model_config, self.workdir, self.thread_num,
-                      self.devices, self.mem_optim, self.ir_optim))
+                      self.device_type, self.devices, self.mem_optim,
+                      self.ir_optim))
             p.daemon = True
             p.start()
             process.append(p)
@@ -583,7 +590,8 @@ class Op(object):
                 args=(concurrency_idx, self._get_input_channel(),
                       self._get_output_channels(), True, trace_buffer,
                       self.model_config, self.workdir, self.thread_num,
-                      self.devices, self.mem_optim, self.ir_optim))
+                      self.device_type, self.devices, self.mem_optim,
+                      self.ir_optim))
             # When a process exits, it attempts to terminate
             # all of its daemonic child processes.
             t.daemon = True
@@ -991,7 +999,7 @@ class Op(object):
     def _run(self, concurrency_idx, input_channel, output_channels,
              is_thread_op, trace_buffer, model_config, workdir, thread_num,
-             devices, mem_optim, ir_optim):
+             device_type, devices, mem_optim, ir_optim):
         """
         _run() is the entry function of OP process / thread model.When client
         type is local_predictor in process mode, the CUDA environment needs to
@@ -1009,6 +1017,7 @@ class Op(object):
            model_config: model config path
            workdir: work directory
            thread_num: number of threads, concurrent quantity
+           device_type: support multiple devices
            devices: gpu id list[gpu], "" default[cpu]
            mem_optim: use memory/graphics memory optimization, True default.
            ir_optim: use calculation chart optimization, False default.
@@ -1017,7 +1026,6 @@ class Op(object):
         Returns:
             None
         """
         op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
-        tid = threading.current_thread().ident
 
         # init ops
         profiler = None
@@ -1028,6 +1036,7 @@ class Op(object):
                 client_type="local_predictor",
                 workdir=workdir,
                 thread_num=thread_num,
+                device_type=device_type,
                 devices=devices,
                 mem_optim=mem_optim,
                 ir_optim=ir_optim)
diff --git a/python/pipeline/pipeline_server.py b/python/pipeline/pipeline_server.py
index ca865d743d5bd5f66ef4f32e6da4722773508785..9043540792730db6c9349243277a63a0565e01c1 100644
--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
@@ -234,6 +234,7 @@ class PipelineServer(object):
                 "local_service_conf": {
                     "workdir": "",
                     "thread_num": 2,
+                    "device_type": -1,
                     "devices": "",
                     "mem_optim": True,
                     "ir_optim": False,
@@ -389,6 +390,7 @@ class ServerYamlConfChecker(object):
         default_conf = {
             "workdir": "",
             "thread_num": 2,
+            "device_type": -1,
             "devices": "",
             "mem_optim": True,
             "ir_optim": False,
@@ -397,6 +399,7 @@ class ServerYamlConfChecker(object):
             "model_config": str,
             "workdir": str,
             "thread_num": int,
+            "device_type": int,
             "devices": str,
             "mem_optim": bool,
             "ir_optim": bool,
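For readers skimming the patch, the standalone sketch below restates the device_type mapping that the new branch in LocalServiceHandler.__init__ implements. It is not part of the change: the helper name resolve_device and its return tuple are hypothetical, chosen only to make the mapping easy to read, and it raises where the real handler only logs an error via _LOGGER.error.

# Hypothetical helper mirroring the device_type branch added to
# LocalServiceHandler.__init__ in this patch; illustration only.
def resolve_device(device_type=-1, devices=""):
    """Map (device_type, devices) to
    (device_name, use_gpu, use_trt, use_lite, use_xpu, device_ids)."""
    if device_type == -1:  # not set: fall back to `devices` (CPU/GPU)
        if devices == "":
            return "cpu", False, False, False, False, [-1]
        return "gpu", True, False, False, False, [int(x) for x in devices.split(",")]
    if device_type == 0:   # CPU
        return "cpu", False, False, False, False, [-1]
    if device_type == 1:   # GPU
        return "gpu", True, False, False, False, [int(x) for x in devices.split(",")]
    if device_type == 2:   # GPU + TensorRT
        return "gpu", True, True, False, False, [int(x) for x in devices.split(",")]
    if device_type == 3:   # ARM CPU via Paddle-Lite
        return "arm", False, False, True, False, [-1]
    if device_type == 4:   # Kunlun XPU via Paddle-Lite
        return "arm", False, False, True, True, [int(x) for x in devices.split(",")]
    # the real handler logs an error instead of raising
    raise ValueError("unsupported device_type: {}".format(device_type))

# The imagenet config above (device_type: 1, devices: "0") resolves to
# ('gpu', True, False, False, False, [0]).
print(resolve_device(1, "0"))

Note that with this patch, ports from available_port_generator are reserved only when client_type is brpc or grpc; with the default local_predictor client the resolved device ids are consumed directly through gpu_id=self._devices[concurrency_idx].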