Unverified commit 44b83132, authored by Guanghua Yu, committed by GitHub

support xpu inference (#3307)

* support xpu inference
Parent 5146077c
...@@ -159,7 +159,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ ...@@ -159,7 +159,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/
| --image_dir | Path of the image directory to predict | | --image_dir | Path of the image directory to predict |
| --video_file | Path of the video file to predict | | --video_file | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --batch_size | Batch size for inference, effective when `image_dir` is specified | | --batch_size | Batch size for inference, effective when `image_dir` is specified |
...@@ -183,7 +183,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ ...@@ -183,7 +183,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/
`Example 2`: `Example 2`:
```shell ```shell
# Predict the video `/root/projects/videos/test.mp4` with `GPU` # Predict the video `/root/projects/videos/test.mp4` with `GPU`
./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 ./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
......
...@@ -101,7 +101,7 @@ make ...@@ -101,7 +101,7 @@ make
| --image_dir | Path of the image directory to predict | | --image_dir | Path of the image directory to predict |
| --video_file | Path of the video file to predict | | --video_file | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --batch_size | Batch size for inference, effective when `image_dir` is specified | | --batch_size | Batch size for inference, effective when `image_dir` is specified |
...@@ -125,7 +125,7 @@ make ...@@ -125,7 +125,7 @@ make
`Example 2`: `Example 2`:
```shell ```shell
# Predict the video `/root/projects/videos/test.mp4` with `GPU` # Predict the video `/root/projects/videos/test.mp4` with `GPU`
./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 ./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
......
...@@ -96,7 +96,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release ...@@ -96,7 +96,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
| --image_dir | Path of the image directory to predict | | --image_dir | Path of the image directory to predict |
| --video_file | Path of the video file to predict | | --video_file | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --batch_size | Batch size for inference, effective when `image_dir` is specified | | --batch_size | Batch size for inference, effective when `image_dir` is specified |
...@@ -122,7 +122,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release ...@@ -122,7 +122,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
`Example 2`: `Example 2`:
```shell ```shell
# Test the video `D:\\videos\\test.mp4` with `GPU` # Test the video `D:\\videos\\test.mp4` with `GPU`
.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --use_gpu=1 .\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
......
...@@ -58,7 +58,7 @@ cv::Mat VisualizeResult(const cv::Mat& img, ...@@ -58,7 +58,7 @@ cv::Mat VisualizeResult(const cv::Mat& img,
class ObjectDetector { class ObjectDetector {
public: public:
explicit ObjectDetector(const std::string& model_dir, explicit ObjectDetector(const std::string& model_dir,
bool use_gpu=false, const std::string& device="CPU",
bool use_mkldnn=false, bool use_mkldnn=false,
int cpu_threads=1, int cpu_threads=1,
const std::string& run_mode="fluid", const std::string& run_mode="fluid",
...@@ -68,7 +68,7 @@ class ObjectDetector { ...@@ -68,7 +68,7 @@ class ObjectDetector {
const int trt_max_shape=1280, const int trt_max_shape=1280,
const int trt_opt_shape=640, const int trt_opt_shape=640,
bool trt_calib_mode=false) { bool trt_calib_mode=false) {
this->use_gpu_ = use_gpu; this->device_ = device;
this->gpu_id_ = gpu_id; this->gpu_id_ = gpu_id;
this->cpu_math_library_num_threads_ = cpu_threads; this->cpu_math_library_num_threads_ = cpu_threads;
this->use_mkldnn_ = use_mkldnn; this->use_mkldnn_ = use_mkldnn;
...@@ -106,7 +106,7 @@ class ObjectDetector { ...@@ -106,7 +106,7 @@ class ObjectDetector {
} }
private: private:
bool use_gpu_ = false; std::string device_ = "CPU";
int gpu_id_ = 0; int gpu_id_ = 0;
int cpu_math_library_num_threads_ = 1; int cpu_math_library_num_threads_ = 1;
bool use_mkldnn_ = false; bool use_mkldnn_ = false;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <math.h> #include <math.h>
#include <algorithm>
#ifdef _WIN32 #ifdef _WIN32
#include <direct.h> #include <direct.h>
...@@ -41,7 +42,8 @@ DEFINE_string(image_dir, "", "Dir of input image, `image_file` has a higher prio ...@@ -41,7 +42,8 @@ DEFINE_string(image_dir, "", "Dir of input image, `image_file` has a higher prio
DEFINE_int32(batch_size, 1, "batch_size"); DEFINE_int32(batch_size, 1, "batch_size");
DEFINE_string(video_file, "", "Path of input video, `video_file` or `camera_id` has a highest priority."); DEFINE_string(video_file, "", "Path of input video, `video_file` or `camera_id` has a highest priority.");
DEFINE_int32(camera_id, -1, "Device id of camera to predict"); DEFINE_int32(camera_id, -1, "Device id of camera to predict");
DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); DEFINE_bool(use_gpu, false, "Deprecated, please use `--device` to set the device you want to run.");
DEFINE_string(device, "CPU", "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.");
DEFINE_double(threshold, 0.5, "Threshold of score."); DEFINE_double(threshold, 0.5, "Threshold of score.");
DEFINE_string(output_dir, "output", "Directory of output visualization files."); DEFINE_string(output_dir, "output", "Directory of output visualization files.");
DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16/trt_int8)"); DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16/trt_int8)");
...@@ -56,7 +58,7 @@ DEFINE_bool(trt_calib_mode, false, "If the model is produced by TRT offline quan ...@@ -56,7 +58,7 @@ DEFINE_bool(trt_calib_mode, false, "If the model is produced by TRT offline quan
void PrintBenchmarkLog(std::vector<double> det_time, int img_num){ void PrintBenchmarkLog(std::vector<double> det_time, int img_num){
LOG(INFO) << "----------------------- Config info -----------------------"; LOG(INFO) << "----------------------- Config info -----------------------";
LOG(INFO) << "runtime_device: " << (FLAGS_use_gpu ? "gpu" : "cpu"); LOG(INFO) << "runtime_device: " << FLAGS_device;
LOG(INFO) << "ir_optim: " << "True"; LOG(INFO) << "ir_optim: " << "True";
LOG(INFO) << "enable_memory_optim: " << "True"; LOG(INFO) << "enable_memory_optim: " << "True";
int has_trt = FLAGS_run_mode.find("trt"); int has_trt = FLAGS_run_mode.find("trt");
...@@ -78,7 +80,7 @@ void PrintBenchmarkLog(std::vector<double> det_time, int img_num){ ...@@ -78,7 +80,7 @@ void PrintBenchmarkLog(std::vector<double> det_time, int img_num){
LOG(INFO) << "model_name: " << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1); LOG(INFO) << "model_name: " << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1);
LOG(INFO) << "----------------------- Perf info ------------------------"; LOG(INFO) << "----------------------- Perf info ------------------------";
LOG(INFO) << "Total number of predicted data: " << img_num LOG(INFO) << "Total number of predicted data: " << img_num
<< " and total time spent(s): " << " and total time spent(ms): "
<< std::accumulate(det_time.begin(), det_time.end(), 0); << std::accumulate(det_time.begin(), det_time.end(), 0);
LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num
<< ", inference_time(ms): " << det_time[1] / img_num << ", inference_time(ms): " << det_time[1] / img_num
...@@ -358,8 +360,17 @@ int main(int argc, char** argv) { ...@@ -358,8 +360,17 @@ int main(int argc, char** argv) {
std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'.";
return -1; return -1;
} }
transform(FLAGS_device.begin(),FLAGS_device.end(),FLAGS_device.begin(),::toupper);
if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" || FLAGS_device == "XPU")) {
std::cout << "device should be 'CPU', 'GPU' or 'XPU'.";
return -1;
}
if (FLAGS_use_gpu) {
std::cout << "Deprecated, please use `--device` to set the device you want to run.";
return -1;
}
// Load model and create a object detector // Load model and create a object detector
PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_use_mkldnn, PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_device, FLAGS_use_mkldnn,
FLAGS_cpu_threads, FLAGS_run_mode, FLAGS_batch_size,FLAGS_gpu_id, FLAGS_cpu_threads, FLAGS_run_mode, FLAGS_batch_size,FLAGS_gpu_id,
FLAGS_trt_min_shape, FLAGS_trt_max_shape, FLAGS_trt_opt_shape, FLAGS_trt_min_shape, FLAGS_trt_max_shape, FLAGS_trt_opt_shape,
FLAGS_trt_calib_mode); FLAGS_trt_calib_mode);
......
...@@ -30,7 +30,7 @@ void ObjectDetector::LoadModel(const std::string& model_dir, ...@@ -30,7 +30,7 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel"; std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel";
std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams"; std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams";
config.SetModel(prog_file, params_file); config.SetModel(prog_file, params_file);
if (this->use_gpu_) { if (this->device_ == "GPU") {
config.EnableUseGpu(200, this->gpu_id_); config.EnableUseGpu(200, this->gpu_id_);
config.SwitchIrOptim(true); config.SwitchIrOptim(true);
// use tensorrt // use tensorrt
...@@ -73,6 +73,8 @@ void ObjectDetector::LoadModel(const std::string& model_dir, ...@@ -73,6 +73,8 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
} }
} }
} else if (this->device_ == "XPU"){
config.EnableXpu(10*1024*1024);
} else { } else {
config.DisableGpu(); config.DisableGpu();
if (this->use_mkldnn_) { if (this->use_mkldnn_) {
......
...@@ -21,26 +21,26 @@ During training, PaddleDetection includes the forward network and optimizer-related parameters, ...@@ -21,26 +21,26 @@ During training, PaddleDetection includes the forward network and optimizer-related parameters,
Run the following command in the terminal to predict: Run the following command in the terminal to predict:
```bash ```bash
python deploy/python/infer.py --model_dir=./inference/yolov3_mobilenet_v1_roadsign --image_file=./demo/road554.png --use_gpu=True python deploy/python/infer.py --model_dir=./inference/yolov3_mobilenet_v1_roadsign --image_file=./demo/road554.png --device=GPU
``` ```
The parameters are described as follows: The parameters are described as follows:
| Parameter | Required | Description | | Parameter | Required | Description |
|-------|-------|----------| |-------|-------|----------|
| --model_dir | Yes | Path of the exported model above | | --model_dir | Yes | Path of the exported model above |
| --image_file | Option | Image file to predict | | --image_file | Option | Image file to predict |
| --image_dir | Option | Path of the image directory to predict | | --image_dir | Option | Path of the image directory to predict |
| --video_file | Option | Video file to predict | | --video_file | Option | Video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera; can be set to 0 - (number of cameras - 1)); press `q` in the visualization window during prediction to exit and write the results to output/output.mp4 | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera; can be set to 0 - (number of cameras - 1)); press `q` in the visualization window during prediction to exit and write the results to output/output.mp4 |
| --use_gpu | No | Whether to use GPU, default is False | | --device | Option | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --run_mode | No | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Option | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --batch_size | No | Batch size for inference, effective when `image_dir` is specified | | --batch_size | Option | Batch size for inference, effective when `image_dir` is specified, default is 1 |
| --threshold | No | Threshold of the prediction score, default is 0.5 | | --threshold | Option | Threshold of the prediction score, default is 0.5 |
| --output_dir | No | Root directory for saving visualization results, default is output/ | | --output_dir | Option | Root directory for saving visualization results, default is output/ |
| --run_benchmark | No | Whether to run benchmark; `--image_file` or `--image_dir` must also be specified | | --run_benchmark | Option | Whether to run benchmark; `--image_file` or `--image_dir` must also be specified, default is False |
| --enable_mkldnn | No | Whether to enable MKLDNN acceleration for CPU inference | | --enable_mkldnn | Option | Whether to enable MKLDNN acceleration for CPU inference, default is False |
| --cpu_threads | No | Number of CPU threads, default is 1 | | --cpu_threads | Option | Number of CPU threads, default is 1 |
Notes: Notes:
......
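Before the Python diffs below, it may help to see the post-parse handling the new flag relies on. The following is a minimal sketch, assuming an argparse-style `flags` object with `device` and `use_gpu` attributes; it mirrors the `__main__` changes made to `deploy/python/infer.py` further down rather than reproducing that file verbatim.

```python
from types import SimpleNamespace

def check_device(flags):
    """Normalize and validate the --device value (mirrors the __main__ changes in this commit)."""
    # Accept lower- or upper-case input: --device=gpu behaves like --device=GPU.
    flags.device = flags.device.upper()
    assert flags.device in ['CPU', 'GPU', 'XPU'], "device should be CPU, GPU or XPU"
    # The old boolean flag is rejected outright rather than silently mapped.
    assert not flags.use_gpu, "use_gpu has been deprecated, please use --device"
    return flags

# e.g. check_device(SimpleNamespace(device='xpu', use_gpu=False)).device == 'XPU'
```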
...@@ -49,7 +49,7 @@ class Detector(object): ...@@ -49,7 +49,7 @@ class Detector(object):
Args: Args:
config (object): config of model, defined by `Config(model_dir)` config (object): config of model, defined by `Config(model_dir)`
model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
batch_size (int): size of pre batch in inference batch_size (int): size of pre batch in inference
trt_min_shape (int): min shape for dynamic shape in trt trt_min_shape (int): min shape for dynamic shape in trt
...@@ -62,7 +62,7 @@ class Detector(object): ...@@ -62,7 +62,7 @@ class Detector(object):
def __init__(self, def __init__(self,
pred_config, pred_config,
model_dir, model_dir,
use_gpu=False, device='CPU',
run_mode='fluid', run_mode='fluid',
batch_size=1, batch_size=1,
trt_min_shape=1, trt_min_shape=1,
...@@ -77,7 +77,7 @@ class Detector(object): ...@@ -77,7 +77,7 @@ class Detector(object):
run_mode=run_mode, run_mode=run_mode,
batch_size=batch_size, batch_size=batch_size,
min_subgraph_size=self.pred_config.min_subgraph_size, min_subgraph_size=self.pred_config.min_subgraph_size,
use_gpu=use_gpu, device=device,
use_dynamic_shape=self.pred_config.use_dynamic_shape, use_dynamic_shape=self.pred_config.use_dynamic_shape,
trt_min_shape=trt_min_shape, trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape, trt_max_shape=trt_max_shape,
...@@ -177,7 +177,7 @@ class DetectorSOLOv2(Detector): ...@@ -177,7 +177,7 @@ class DetectorSOLOv2(Detector):
Args: Args:
config (object): config of model, defined by `Config(model_dir)` config (object): config of model, defined by `Config(model_dir)`
model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
batch_size (int): size of pre batch in inference batch_size (int): size of pre batch in inference
trt_min_shape (int): min shape for dynamic shape in trt trt_min_shape (int): min shape for dynamic shape in trt
...@@ -189,7 +189,7 @@ class DetectorSOLOv2(Detector): ...@@ -189,7 +189,7 @@ class DetectorSOLOv2(Detector):
def __init__(self, def __init__(self,
pred_config, pred_config,
model_dir, model_dir,
use_gpu=False, device='CPU',
run_mode='fluid', run_mode='fluid',
batch_size=1, batch_size=1,
trt_min_shape=1, trt_min_shape=1,
...@@ -204,7 +204,7 @@ class DetectorSOLOv2(Detector): ...@@ -204,7 +204,7 @@ class DetectorSOLOv2(Detector):
run_mode=run_mode, run_mode=run_mode,
batch_size=batch_size, batch_size=batch_size,
min_subgraph_size=self.pred_config.min_subgraph_size, min_subgraph_size=self.pred_config.min_subgraph_size,
use_gpu=use_gpu, device=device,
use_dynamic_shape=self.pred_config.use_dynamic_shape, use_dynamic_shape=self.pred_config.use_dynamic_shape,
trt_min_shape=trt_min_shape, trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape, trt_max_shape=trt_max_shape,
...@@ -352,7 +352,7 @@ class PredictConfig(): ...@@ -352,7 +352,7 @@ class PredictConfig():
def load_predictor(model_dir, def load_predictor(model_dir,
run_mode='fluid', run_mode='fluid',
batch_size=1, batch_size=1,
use_gpu=False, device='CPU',
min_subgraph_size=3, min_subgraph_size=3,
use_dynamic_shape=False, use_dynamic_shape=False,
trt_min_shape=1, trt_min_shape=1,
...@@ -364,7 +364,7 @@ def load_predictor(model_dir, ...@@ -364,7 +364,7 @@ def load_predictor(model_dir,
"""set AnalysisConfig, generate AnalysisPredictor """set AnalysisConfig, generate AnalysisPredictor
Args: Args:
model_dir (str): root path of __model__ and __params__ model_dir (str): root path of __model__ and __params__
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16/trt_int8) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16/trt_int8)
use_dynamic_shape (bool): use dynamic shape or not use_dynamic_shape (bool): use dynamic shape or not
trt_min_shape (int): min shape for dynamic shape in trt trt_min_shape (int): min shape for dynamic shape in trt
...@@ -375,25 +375,22 @@ def load_predictor(model_dir, ...@@ -375,25 +375,22 @@ def load_predictor(model_dir,
Returns: Returns:
predictor (PaddlePredictor): AnalysisPredictor predictor (PaddlePredictor): AnalysisPredictor
Raises: Raises:
ValueError: predict by TensorRT need use_gpu == True. ValueError: predict by TensorRT need device == 'GPU'.
""" """
if not use_gpu and not run_mode == 'fluid': if device != 'GPU' and run_mode != 'fluid':
raise ValueError( raise ValueError(
"Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
.format(run_mode, use_gpu)) .format(run_mode, device))
config = Config( config = Config(
os.path.join(model_dir, 'model.pdmodel'), os.path.join(model_dir, 'model.pdmodel'),
os.path.join(model_dir, 'model.pdiparams')) os.path.join(model_dir, 'model.pdiparams'))
precision_map = { if device == 'GPU':
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if use_gpu:
# initial GPU memory(M), device ID # initial GPU memory(M), device ID
config.enable_use_gpu(200, 0) config.enable_use_gpu(200, 0)
# optimize graph and fuse op # optimize graph and fuse op
config.switch_ir_optim(True) config.switch_ir_optim(True)
elif device == 'XPU':
config.enable_xpu(10 * 1024 * 1024)
else: else:
config.disable_gpu() config.disable_gpu()
config.set_cpu_math_library_num_threads(cpu_threads) config.set_cpu_math_library_num_threads(cpu_threads)
...@@ -408,6 +405,11 @@ def load_predictor(model_dir, ...@@ -408,6 +405,11 @@ def load_predictor(model_dir,
) )
pass pass
precision_map = {
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if run_mode in precision_map.keys(): if run_mode in precision_map.keys():
config.enable_tensorrt_engine( config.enable_tensorrt_engine(
workspace_size=1 << 10, workspace_size=1 << 10,
...@@ -582,7 +584,7 @@ def main(): ...@@ -582,7 +584,7 @@ def main():
detector = Detector( detector = Detector(
pred_config, pred_config,
FLAGS.model_dir, FLAGS.model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
batch_size=FLAGS.batch_size, batch_size=FLAGS.batch_size,
trt_min_shape=FLAGS.trt_min_shape, trt_min_shape=FLAGS.trt_min_shape,
...@@ -595,7 +597,7 @@ def main(): ...@@ -595,7 +597,7 @@ def main():
detector = DetectorSOLOv2( detector = DetectorSOLOv2(
pred_config, pred_config,
FLAGS.model_dir, FLAGS.model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
batch_size=FLAGS.batch_size, batch_size=FLAGS.batch_size,
trt_min_shape=FLAGS.trt_min_shape, trt_min_shape=FLAGS.trt_min_shape,
...@@ -645,5 +647,9 @@ if __name__ == '__main__': ...@@ -645,5 +647,9 @@ if __name__ == '__main__':
parser = argsparser() parser = argsparser()
FLAGS = parser.parse_args() FLAGS = parser.parse_args()
print_arguments(FLAGS) print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU'
], "device should be CPU, GPU or XPU"
assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device"
main() main()
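To condense the `load_predictor` change above: the boolean `use_gpu` switch becomes a three-way dispatch on the device string. Below is a minimal sketch of that branch using the `paddle.inference` API (assuming Paddle 2.x is installed); the 200 MB GPU memory pool and 10 MB XPU L3 workspace follow the values used in this commit, while the helper name `build_config` and the MKLDNN handling are illustrative.

```python
# Minimal sketch of the device dispatch in load_predictor() after this commit.
import os
from paddle.inference import Config, create_predictor

def build_config(model_dir, device='CPU', cpu_threads=1, enable_mkldnn=False):
    config = Config(
        os.path.join(model_dir, 'model.pdmodel'),
        os.path.join(model_dir, 'model.pdiparams'))
    if device == 'GPU':
        # initial GPU memory pool (MB) and device id, as in the commit
        config.enable_use_gpu(200, 0)
        config.switch_ir_optim(True)
    elif device == 'XPU':
        # XPU L3 workspace size in bytes (10 MB, as in the commit)
        config.enable_xpu(10 * 1024 * 1024)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(cpu_threads)
        if enable_mkldnn:
            config.enable_mkldnn()
    return config

# predictor = create_predictor(build_config('./inference_model', device='XPU'))
```

Moving the TensorRT `precision_map` below this branch, as the diff does, keeps the CPU and XPU paths untouched when TensorRT options change.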
...@@ -156,7 +156,7 @@ def main(): ...@@ -156,7 +156,7 @@ def main():
detector = Detector( detector = Detector(
pred_config, pred_config,
FLAGS.det_model_dir, FLAGS.det_model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
trt_min_shape=FLAGS.trt_min_shape, trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape, trt_max_shape=FLAGS.trt_max_shape,
...@@ -169,7 +169,7 @@ def main(): ...@@ -169,7 +169,7 @@ def main():
topdown_keypoint_detector = KeyPoint_Detector( topdown_keypoint_detector = KeyPoint_Detector(
pred_config, pred_config,
FLAGS.keypoint_model_dir, FLAGS.keypoint_model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
trt_min_shape=FLAGS.trt_min_shape, trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape, trt_max_shape=FLAGS.trt_max_shape,
...@@ -193,5 +193,8 @@ if __name__ == '__main__': ...@@ -193,5 +193,8 @@ if __name__ == '__main__':
parser = argsparser() parser = argsparser()
FLAGS = parser.parse_args() FLAGS = parser.parse_args()
print_arguments(FLAGS) print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU'
], "device should be CPU, GPU or XPU"
main() main()
...@@ -44,7 +44,7 @@ class KeyPoint_Detector(object): ...@@ -44,7 +44,7 @@ class KeyPoint_Detector(object):
Args: Args:
config (object): config of model, defined by `Config(model_dir)` config (object): config of model, defined by `Config(model_dir)`
model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
trt_min_shape (int): min shape for dynamic shape in trt trt_min_shape (int): min shape for dynamic shape in trt
trt_max_shape (int): max shape for dynamic shape in trt trt_max_shape (int): max shape for dynamic shape in trt
...@@ -56,7 +56,7 @@ class KeyPoint_Detector(object): ...@@ -56,7 +56,7 @@ class KeyPoint_Detector(object):
def __init__(self, def __init__(self,
pred_config, pred_config,
model_dir, model_dir,
use_gpu=False, device='CPU',
run_mode='fluid', run_mode='fluid',
trt_min_shape=1, trt_min_shape=1,
trt_max_shape=1280, trt_max_shape=1280,
...@@ -69,7 +69,7 @@ class KeyPoint_Detector(object): ...@@ -69,7 +69,7 @@ class KeyPoint_Detector(object):
model_dir, model_dir,
run_mode=run_mode, run_mode=run_mode,
min_subgraph_size=self.pred_config.min_subgraph_size, min_subgraph_size=self.pred_config.min_subgraph_size,
use_gpu=use_gpu, device=device,
use_dynamic_shape=self.pred_config.use_dynamic_shape, use_dynamic_shape=self.pred_config.use_dynamic_shape,
trt_min_shape=trt_min_shape, trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape, trt_max_shape=trt_max_shape,
...@@ -236,7 +236,7 @@ class PredictConfig_KeyPoint(): ...@@ -236,7 +236,7 @@ class PredictConfig_KeyPoint():
def load_predictor(model_dir, def load_predictor(model_dir,
run_mode='fluid', run_mode='fluid',
batch_size=1, batch_size=1,
use_gpu=False, device='CPU',
min_subgraph_size=3, min_subgraph_size=3,
use_dynamic_shape=False, use_dynamic_shape=False,
trt_min_shape=1, trt_min_shape=1,
...@@ -248,7 +248,7 @@ def load_predictor(model_dir, ...@@ -248,7 +248,7 @@ def load_predictor(model_dir,
"""set AnalysisConfig, generate AnalysisPredictor """set AnalysisConfig, generate AnalysisPredictor
Args: Args:
model_dir (str): root path of __model__ and __params__ model_dir (str): root path of __model__ and __params__
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16/trt_int8) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16/trt_int8)
use_dynamic_shape (bool): use dynamic shape or not use_dynamic_shape (bool): use dynamic shape or not
trt_min_shape (int): min shape for dynamic shape in trt trt_min_shape (int): min shape for dynamic shape in trt
...@@ -259,25 +259,22 @@ def load_predictor(model_dir, ...@@ -259,25 +259,22 @@ def load_predictor(model_dir,
Returns: Returns:
predictor (PaddlePredictor): AnalysisPredictor predictor (PaddlePredictor): AnalysisPredictor
Raises: Raises:
ValueError: predict by TensorRT need use_gpu == True. ValueError: predict by TensorRT need device == 'GPU'.
""" """
if not use_gpu and not run_mode == 'fluid': if device != 'GPU' and run_mode != 'fluid':
raise ValueError( raise ValueError(
"Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
.format(run_mode, use_gpu)) .format(run_mode, device))
config = Config( config = Config(
os.path.join(model_dir, 'model.pdmodel'), os.path.join(model_dir, 'model.pdmodel'),
os.path.join(model_dir, 'model.pdiparams')) os.path.join(model_dir, 'model.pdiparams'))
precision_map = { if device == 'GPU':
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if use_gpu:
# initial GPU memory(M), device ID # initial GPU memory(M), device ID
config.enable_use_gpu(200, 0) config.enable_use_gpu(200, 0)
# optimize graph and fuse op # optimize graph and fuse op
config.switch_ir_optim(True) config.switch_ir_optim(True)
elif device == 'XPU':
config.enable_xpu(10 * 1024 * 1024)
else: else:
config.disable_gpu() config.disable_gpu()
config.set_cpu_math_library_num_threads(cpu_threads) config.set_cpu_math_library_num_threads(cpu_threads)
...@@ -292,6 +289,11 @@ def load_predictor(model_dir, ...@@ -292,6 +289,11 @@ def load_predictor(model_dir,
) )
pass pass
precision_map = {
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if run_mode in precision_map.keys(): if run_mode in precision_map.keys():
config.enable_tensorrt_engine( config.enable_tensorrt_engine(
workspace_size=1 << 10, workspace_size=1 << 10,
...@@ -381,7 +383,7 @@ def main(): ...@@ -381,7 +383,7 @@ def main():
detector = KeyPoint_Detector( detector = KeyPoint_Detector(
pred_config, pred_config,
FLAGS.model_dir, FLAGS.model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
trt_min_shape=FLAGS.trt_min_shape, trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape, trt_max_shape=FLAGS.trt_max_shape,
...@@ -427,5 +429,9 @@ if __name__ == '__main__': ...@@ -427,5 +429,9 @@ if __name__ == '__main__':
parser = argsparser() parser = argsparser()
FLAGS = parser.parse_args() FLAGS = parser.parse_args()
print_arguments(FLAGS) print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU'
], "device should be CPU, GPU or XPU"
assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device"
main() main()
...@@ -68,10 +68,11 @@ def argsparser(): ...@@ -68,10 +68,11 @@ def argsparser():
default='fluid', default='fluid',
help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)") help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)")
parser.add_argument( parser.add_argument(
"--use_gpu", "--device",
type=ast.literal_eval, type=str,
default=False, default='cpu',
help="Whether to predict with GPU.") help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."
)
parser.add_argument( parser.add_argument(
"--run_benchmark", "--run_benchmark",
type=ast.literal_eval, type=ast.literal_eval,
......
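For reference, a stand-alone sketch of the flag definitions this commit adds to the Python deploy tools: `--device` is the new switch, and `--use_gpu` is kept only so that old command lines fail with a clear message rather than silently running on CPU. The parser scaffolding here is illustrative; only the two `add_argument` calls mirror the diff.

```python
# Sketch of the CLI flags added to the Python deploy tools in this commit.
import argparse
import ast

parser = argparse.ArgumentParser()
parser.add_argument(
    "--device",
    type=str,
    default='cpu',
    help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.")
parser.add_argument(
    "--use_gpu",
    type=ast.literal_eval,
    default=False,
    help="Deprecated, please use `--device`.")

# Example: `--device gpu` parses as 'gpu' and is upper-cased by the caller before use.
FLAGS = parser.parse_args()
```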
...@@ -59,11 +59,17 @@ def argsparser(): ...@@ -59,11 +59,17 @@ def argsparser():
type=str, type=str,
default='fluid', default='fluid',
help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)") help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)")
parser.add_argument(
"--device",
type=str,
default='cpu',
help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."
)
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
type=ast.literal_eval, type=ast.literal_eval,
default=False, default=False,
help="Whether to predict with GPU.") help="Deprecated, please use `--device`.")
parser.add_argument( parser.add_argument(
"--run_benchmark", "--run_benchmark",
type=ast.literal_eval, type=ast.literal_eval,
......
...@@ -153,7 +153,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ ...@@ -153,7 +153,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/
| --image_file | Path of the image file to predict | | --image_file | Path of the image file to predict |
| --video_path | Path of the video file to predict | | --video_path | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --run_benchmark | Whether to repeat inference to benchmark speed | | --run_benchmark | Whether to repeat inference to benchmark speed |
...@@ -174,7 +174,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ ...@@ -174,7 +174,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/
`Example 2`: `Example 2`:
```shell ```shell
# Predict the video `/root/projects/videos/test.mp4` with `GPU` # Predict the video `/root/projects/videos/test.mp4` with `GPU`
./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 ./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
......
...@@ -100,7 +100,7 @@ make ...@@ -100,7 +100,7 @@ make
| --image_file | Path of the image file to predict | | --image_file | Path of the image file to predict |
| --video_path | Path of the video file to predict | | --video_path | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --run_benchmark | Whether to repeat inference to benchmark speed | | --run_benchmark | Whether to repeat inference to benchmark speed |
...@@ -121,6 +121,6 @@ make ...@@ -121,6 +121,6 @@ make
`Example 2`: `Example 2`:
```shell ```shell
# Predict the video `/root/projects/videos/test.mp4` with `GPU` # Predict the video `/root/projects/videos/test.mp4` with `GPU`
./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 ./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
...@@ -95,7 +95,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release ...@@ -95,7 +95,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
| --image_file | Path of the image file to predict | | --image_file | Path of the image file to predict |
| --video_path | Path of the video file to predict | | --video_path | Path of the video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera) |
| --use_gpu | Whether to use GPU for inference, 0 or 1 (default: 0) | | --device | Runtime device, one of `CPU/GPU/XPU` (default: `CPU`) |
| --gpu_id | GPU device id used for inference (default: 0) | | --gpu_id | GPU device id used for inference (default: 0) |
| --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --run_benchmark | Whether to repeat inference to benchmark speed | | --run_benchmark | Whether to repeat inference to benchmark speed |
...@@ -118,7 +118,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release ...@@ -118,7 +118,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
`Example 2`: `Example 2`:
```shell ```shell
# Test the video `D:\\videos\\test.mp4` with `GPU` # Test the video `D:\\videos\\test.mp4` with `GPU`
.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --use_gpu=1 .\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --device=GPU
``` ```
Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory. Video prediction currently supports the `.mp4` format only; the `visualized prediction results` are saved to `output.mp4` in the current directory.
......
...@@ -56,20 +56,20 @@ cv::Mat VisualizeResult(const cv::Mat& img, ...@@ -56,20 +56,20 @@ cv::Mat VisualizeResult(const cv::Mat& img,
class ObjectDetector { class ObjectDetector {
public: public:
explicit ObjectDetector(const std::string& model_dir, explicit ObjectDetector(const std::string& model_dir,
bool use_gpu=false, const std::string& device,
const std::string& run_mode="fluid", const std::string& run_mode="fluid",
const int gpu_id=0, const int gpu_id=0,
bool trt_calib_mode=false) { bool trt_calib_mode=false) {
config_.load_config(model_dir); config_.load_config(model_dir);
threshold_ = config_.draw_threshold_; threshold_ = config_.draw_threshold_;
preprocessor_.Init(config_.preprocess_info_, config_.arch_); preprocessor_.Init(config_.preprocess_info_, config_.arch_);
LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode, gpu_id, trt_calib_mode); LoadModel(model_dir, device, config_.min_subgraph_size_, 1, run_mode, gpu_id, trt_calib_mode);
} }
// Load Paddle inference model // Load Paddle inference model
void LoadModel( void LoadModel(
const std::string& model_dir, const std::string& model_dir,
bool use_gpu, const std::string& device,
const int min_subgraph_size, const int min_subgraph_size,
const int batch_size = 1, const int batch_size = 1,
const std::string& run_mode = "fluid", const std::string& run_mode = "fluid",
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <algorithm>
#ifdef _WIN32 #ifdef _WIN32
#include <direct.h> #include <direct.h>
...@@ -35,7 +36,8 @@ ...@@ -35,7 +36,8 @@
DEFINE_string(model_dir, "", "Path of inference model"); DEFINE_string(model_dir, "", "Path of inference model");
DEFINE_string(image_file, "", "Path of input image"); DEFINE_string(image_file, "", "Path of input image");
DEFINE_string(video_path, "", "Path of input video"); DEFINE_string(video_path, "", "Path of input video");
DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); DEFINE_bool(use_gpu, false, "Deprecated, please use `--device` to set the device you want to run.");
DEFINE_string(device, "CPU", "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.");
DEFINE_bool(use_camera, false, "Use camera or not"); DEFINE_bool(use_camera, false, "Use camera or not");
DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16)"); DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16)");
DEFINE_int32(gpu_id, 0, "Device id of GPU to execute"); DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
...@@ -204,9 +206,18 @@ int main(int argc, char** argv) { ...@@ -204,9 +206,18 @@ int main(int argc, char** argv) {
std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'.";
return -1; return -1;
} }
transform(FLAGS_device.begin(),FLAGS_device.end(),FLAGS_device.begin(),::toupper);
if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" || FLAGS_device == "XPU")) {
std::cout << "device should be 'CPU', 'GPU' or 'XPU'.";
return -1;
}
if (FLAGS_use_gpu) {
std::cout << "Deprecated, please use `--device` to set the device you want to run.";
return -1;
}
// Load model and create a object detector // Load model and create a object detector
PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu, PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_device,
FLAGS_run_mode, FLAGS_gpu_id, FLAGS_trt_calib_mode); FLAGS_run_mode, FLAGS_gpu_id, FLAGS_trt_calib_mode);
// Do inference on input video or image // Do inference on input video or image
if (!FLAGS_video_path.empty() || FLAGS_use_camera) { if (!FLAGS_video_path.empty() || FLAGS_use_camera) {
......
...@@ -21,7 +21,7 @@ namespace PaddleDetection { ...@@ -21,7 +21,7 @@ namespace PaddleDetection {
// Load Model and create model predictor // Load Model and create model predictor
void ObjectDetector::LoadModel(const std::string& model_dir, void ObjectDetector::LoadModel(const std::string& model_dir,
bool use_gpu, const std::string& device,
const int min_subgraph_size, const int min_subgraph_size,
const int batch_size, const int batch_size,
const std::string& run_mode, const std::string& run_mode,
...@@ -31,7 +31,7 @@ void ObjectDetector::LoadModel(const std::string& model_dir, ...@@ -31,7 +31,7 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
std::string prog_file = model_dir + OS_PATH_SEP + "__model__"; std::string prog_file = model_dir + OS_PATH_SEP + "__model__";
std::string params_file = model_dir + OS_PATH_SEP + "__params__"; std::string params_file = model_dir + OS_PATH_SEP + "__params__";
config.SetModel(prog_file, params_file); config.SetModel(prog_file, params_file);
if (use_gpu) { if (device == "GPU") {
config.EnableUseGpu(100, gpu_id); config.EnableUseGpu(100, gpu_id);
config.SwitchIrOptim(true); config.SwitchIrOptim(true);
if (run_mode != "fluid") { if (run_mode != "fluid") {
...@@ -51,6 +51,8 @@ void ObjectDetector::LoadModel(const std::string& model_dir, ...@@ -51,6 +51,8 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
false, false,
trt_calib_mode); trt_calib_mode);
} }
} else if (device == "XPU"){
config.EnableXpu(10*1024*1024);
} else { } else {
config.DisableGpu(); config.DisableGpu();
} }
......
...@@ -45,7 +45,7 @@ python deploy/python/infer.py --model_dir=/path/to/models --image_file=/path/to/ ...@@ -45,7 +45,7 @@ python deploy/python/infer.py --model_dir=/path/to/models --image_file=/path/to/
| --image_file | Option | Image file to predict | | --image_file | Option | Image file to predict |
| --video_file | Option | Video file to predict | | --video_file | Option | Video file to predict |
| --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera; can be set to 0 - (number of cameras - 1)); press `q` in the visualization window during prediction to exit and write the results to output/output.mp4 | | --camera_id | Option | Camera ID used for prediction, default is -1 (do not predict from a camera; can be set to 0 - (number of cameras - 1)); press `q` in the visualization window during prediction to exit and write the results to output/output.mp4 |
| --use_gpu | No | Whether to use GPU, default is False | | --device | Option | Runtime device, one of `CPU/GPU` (default: `CPU`) |
| --run_mode | No | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 | | --run_mode | No | Run mode when using GPU, default is fluid, options: fluid/trt_fp32/trt_fp16/trt_int8 |
| --threshold | No | Threshold of the prediction score, default is 0.5 | | --threshold | No | Threshold of the prediction score, default is 0.5 |
| --output_dir | No | Root directory for saving visualization results, default is output/ | | --output_dir | No | Root directory for saving visualization results, default is output/ |
......
...@@ -55,7 +55,7 @@ class Detector(object): ...@@ -55,7 +55,7 @@ class Detector(object):
Args: Args:
config (object): config of model, defined by `Config(model_dir)` config (object): config of model, defined by `Config(model_dir)`
model_dir (str): root path of __model__, __params__ and infer_cfg.yml model_dir (str): root path of __model__, __params__ and infer_cfg.yml
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
threshold (float): threshold to reserve the result for output. threshold (float): threshold to reserve the result for output.
""" """
...@@ -63,20 +63,20 @@ class Detector(object): ...@@ -63,20 +63,20 @@ class Detector(object):
def __init__(self, def __init__(self,
config, config,
model_dir, model_dir,
use_gpu=False, device='CPU',
run_mode='fluid', run_mode='fluid',
threshold=0.5, threshold=0.5,
trt_calib_mode=False): trt_calib_mode=False):
self.config = config self.config = config
if self.config.use_python_inference: if self.config.use_python_inference:
self.executor, self.program, self.fecth_targets = load_executor( self.executor, self.program, self.fecth_targets = load_executor(
model_dir, use_gpu=use_gpu) model_dir, device=device)
else: else:
self.predictor = load_predictor( self.predictor = load_predictor(
model_dir, model_dir,
run_mode=run_mode, run_mode=run_mode,
min_subgraph_size=self.config.min_subgraph_size, min_subgraph_size=self.config.min_subgraph_size,
use_gpu=use_gpu, device=device,
trt_calib_mode=trt_calib_mode) trt_calib_mode=trt_calib_mode)
def preprocess(self, im): def preprocess(self, im):
...@@ -221,14 +221,14 @@ class DetectorSOLOv2(Detector): ...@@ -221,14 +221,14 @@ class DetectorSOLOv2(Detector):
def __init__(self, def __init__(self,
config, config,
model_dir, model_dir,
use_gpu=False, device='CPU',
run_mode='fluid', run_mode='fluid',
threshold=0.5, threshold=0.5,
trt_calib_mode=False): trt_calib_mode=False):
super(DetectorSOLOv2, self).__init__( super(DetectorSOLOv2, self).__init__(
config=config, config=config,
model_dir=model_dir, model_dir=model_dir,
use_gpu=use_gpu, device=device,
run_mode=run_mode, run_mode=run_mode,
threshold=threshold, threshold=threshold,
trt_calib_mode=trt_calib_mode) trt_calib_mode=trt_calib_mode)
...@@ -382,24 +382,24 @@ class Config(): ...@@ -382,24 +382,24 @@ class Config():
def load_predictor(model_dir, def load_predictor(model_dir,
run_mode='fluid', run_mode='fluid',
batch_size=1, batch_size=1,
use_gpu=False, device='CPU',
min_subgraph_size=3, min_subgraph_size=3,
trt_calib_mode=False): trt_calib_mode=False):
"""set AnalysisConfig, generate AnalysisPredictor """set AnalysisConfig, generate AnalysisPredictor
Args: Args:
model_dir (str): root path of __model__ and __params__ model_dir (str): root path of __model__ and __params__
use_gpu (bool): whether use gpu device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
trt_calib_mode (bool): If the model is produced by TRT offline quantitative trt_calib_mode (bool): If the model is produced by TRT offline quantitative
calibration, trt_calib_mode need to set True calibration, trt_calib_mode need to set True
Returns: Returns:
predictor (PaddlePredictor): AnalysisPredictor predictor (PaddlePredictor): AnalysisPredictor
Raises: Raises:
ValueError: predict by TensorRT need use_gpu == True. ValueError: predict by TensorRT need device == GPU.
""" """
if not use_gpu and not run_mode == 'fluid': if device != 'GPU' and not run_mode == 'fluid':
raise ValueError( raise ValueError(
"Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" "Predict by TensorRT mode: {}, expect device==GPU, but device == {}"
.format(run_mode, use_gpu)) .format(run_mode, device))
precision_map = { precision_map = {
'trt_int8': fluid.core.AnalysisConfig.Precision.Int8, 'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32, 'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
...@@ -408,11 +408,13 @@ def load_predictor(model_dir, ...@@ -408,11 +408,13 @@ def load_predictor(model_dir,
config = fluid.core.AnalysisConfig( config = fluid.core.AnalysisConfig(
os.path.join(model_dir, '__model__'), os.path.join(model_dir, '__model__'),
os.path.join(model_dir, '__params__')) os.path.join(model_dir, '__params__'))
if use_gpu: if device == 'GPU':
# initial GPU memory(M), device ID # initial GPU memory(M), device ID
config.enable_use_gpu(100, 0) config.enable_use_gpu(100, 0)
# optimize graph and fuse op # optimize graph and fuse op
config.switch_ir_optim(True) config.switch_ir_optim(True)
elif device == 'XPU':
config.enable_xpu(10 * 1024 * 1024)
else: else:
config.disable_gpu() config.disable_gpu()
...@@ -435,8 +437,8 @@ def load_predictor(model_dir, ...@@ -435,8 +437,8 @@ def load_predictor(model_dir,
return predictor return predictor
def load_executor(model_dir, use_gpu=False): def load_executor(model_dir, device='CPU'):
if use_gpu: if device == 'GPU':
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
else: else:
place = fluid.CPUPlace() place = fluid.CPUPlace()
...@@ -539,14 +541,14 @@ def main(): ...@@ -539,14 +541,14 @@ def main():
detector = Detector( detector = Detector(
config, config,
FLAGS.model_dir, FLAGS.model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
trt_calib_mode=FLAGS.trt_calib_mode) trt_calib_mode=FLAGS.trt_calib_mode)
if config.arch == 'SOLOv2': if config.arch == 'SOLOv2':
detector = DetectorSOLOv2( detector = DetectorSOLOv2(
config, config,
FLAGS.model_dir, FLAGS.model_dir,
use_gpu=FLAGS.use_gpu, device=FLAGS.device,
run_mode=FLAGS.run_mode, run_mode=FLAGS.run_mode,
trt_calib_mode=FLAGS.trt_calib_mode) trt_calib_mode=FLAGS.trt_calib_mode)
# predict from image # predict from image
...@@ -584,11 +586,18 @@ if __name__ == '__main__': ...@@ -584,11 +586,18 @@ if __name__ == '__main__':
type=str, type=str,
default='fluid', default='fluid',
help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)") help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)")
parser.add_argument(
"--device",
type=str,
default='cpu',
help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."
)
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
type=ast.literal_eval, type=ast.literal_eval,
default=False, default=False,
help="Whether to predict with GPU.") help="Deprecated, please use `--device` to set the device you want to run."
)
parser.add_argument( parser.add_argument(
"--run_benchmark", "--run_benchmark",
type=ast.literal_eval, type=ast.literal_eval,
...@@ -612,5 +621,9 @@ if __name__ == '__main__': ...@@ -612,5 +621,9 @@ if __name__ == '__main__':
print_arguments(FLAGS) print_arguments(FLAGS)
if FLAGS.image_file != '' and FLAGS.video_file != '': if FLAGS.image_file != '' and FLAGS.video_file != '':
assert "Cannot predict image and video at the same time" assert "Cannot predict image and video at the same time"
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU'
], "device should be CPU, GPU or XPU"
assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device"
main() main()
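One behavioural detail worth calling out: the TensorRT precondition is unchanged in spirit but re-expressed in terms of the device string, so any `trt_*` run mode still requires a GPU. A minimal sketch of that guard, with the error message taken from the diff (the helper name is illustrative):

```python
# Sketch of the TensorRT guard after the switch to a device string:
# any trt_* run mode still requires device == 'GPU'.
def check_trt_requirements(device, run_mode='fluid'):
    if device != 'GPU' and run_mode != 'fluid':
        raise ValueError(
            "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
            .format(run_mode, device))

check_trt_requirements('GPU', 'trt_fp16')    # fine
# check_trt_requirements('XPU', 'trt_fp16')  # raises ValueError
```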