diff --git a/modules/image/face_detection/pyramidbox_lite_mobile/module.py b/modules/image/face_detection/pyramidbox_lite_mobile/module.py index 468aff860de23e9466d751217ab777f1895e5b96..1f688e9227bed05273a90627dcf3f4ff74e192c8 100644 --- a/modules/image/face_detection/pyramidbox_lite_mobile/module.py +++ b/modules/image/face_detection/pyramidbox_lite_mobile/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from pyramidbox_lite_mobile.data_feed import reader @@ -29,26 +32,53 @@ class PyramidBoxLiteMobile(hub.Module): self._set_config() self.processor = self + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) - - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + self.cpu_predictor = create_predictor(cpu_config) + + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def face_detection(self, images=None, @@ -58,7 +88,8 @@ class PyramidBoxLiteMobile(hub.Module): output_dir='detection_result', visualization=False, shrink=0.5, - confs_threshold=0.6): + confs_threshold=0.6, + use_device=None): """ API for face detection. @@ -70,18 +101,29 @@ class PyramidBoxLiteMobile(hub.Module): visualization (bool): Whether to save image or not. shrink (float): parameter to control the resize scale in preprocess. confs_threshold (float): confidence threshold. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The result of face detection and save path of images. """ - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." 
- ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor # compatibility with older versions if data: @@ -97,11 +139,19 @@ class PyramidBoxLiteMobile(hub.Module): res = list() # process one by one for element in reader(images, paths, shrink): - image = np.expand_dims(element['image'], axis=0).astype('float32') - image_tensor = PaddleTensor(image.copy()) - data_out = self.gpu_predictor.run([image_tensor]) if use_gpu else self.cpu_predictor.run([image_tensor]) + batch_image = np.expand_dims(element['image'], axis=0).astype('float32') + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + out = postprocess( - data_out=data_out[0].as_ndarray(), + data_out=predictor_output, org_im=element['org_im'], org_im_path=element['org_im_path'], image_width=element['image_width'], @@ -166,7 +216,8 @@ class PyramidBoxLiteMobile(hub.Module): output_dir=args.output_dir, visualization=args.visualization, shrink=args.shrink, - confs_threshold=args.confs_threshold) + confs_threshold=args.confs_threshold, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -179,6 +230,10 @@ class PyramidBoxLiteMobile(hub.Module): '--output_dir', type=str, default='detection_result', help="The directory to save output images.") self.arg_config_group.add_argument( '--visualization', type=ast.literal_eval, default=False, help="whether to save output as images.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/image/face_detection/pyramidbox_lite_mobile_mask/data_feed.py b/modules/image/face_detection/pyramidbox_lite_mobile_mask/data_feed.py index 608eb0d7d8bea89e3a41f785ddefe519ce538af6..1e9503eca241a72a95e70252c3bfed3972f80454 100644 --- a/modules/image/face_detection/pyramidbox_lite_mobile_mask/data_feed.py +++ b/modules/image/face_detection/pyramidbox_lite_mobile_mask/data_feed.py @@ -101,7 +101,7 @@ def process_image(org_im, face): return image_in -def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale): +def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale, use_device=None): """ Preprocess to yield image. @@ -113,6 +113,7 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_m paths (list[str]): paths to images. use_gpu (bool): whether to use gpu in face_detector. use_multi_scale (bool): whether to enable multi-scale face detection. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Yield: element (collections.OrderedDict): info of original image, preprocessed image, contains 3 keys: org_im (numpy.ndarray) : original image. 
@@ -149,7 +150,8 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_m use_gpu=use_gpu, visualization=False, shrink=scale, - confs_threshold=confs_threshold) + confs_threshold=confs_threshold, + use_device=use_device) _s = list() for _face in _detect_res[0]['data']: @@ -172,7 +174,8 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_m use_gpu=use_gpu, visualization=False, shrink=shrink, - confs_threshold=confs_threshold) + confs_threshold=confs_threshold, + use_device=use_device) detect_faces = _detect_res[0]['data'] element['preprocessed'] = list() diff --git a/modules/image/face_detection/pyramidbox_lite_mobile_mask/module.py b/modules/image/face_detection/pyramidbox_lite_mobile_mask/module.py index 73d3c68512255ad024b19800d327f3d744d6da77..caa99a9daa77576f8bb860d6084961d29fe4cef5 100644 --- a/modules/image/face_detection/pyramidbox_lite_mobile_mask/module.py +++ b/modules/image/face_detection/pyramidbox_lite_mobile_mask/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from pyramidbox_lite_mobile_mask.data_feed import reader @@ -38,26 +41,53 @@ class PyramidBoxLiteMobileMask(hub.Module): self._set_config() self.processor = self + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) + self.cpu_predictor = create_predictor(cpu_config) - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def set_face_detector_module(self, face_detector_module): """ @@ -80,7 +110,8 @@ class PyramidBoxLiteMobileMask(hub.Module): output_dir='detection_result', use_multi_scale=False, shrink=0.5, - confs_threshold=0.6): + confs_threshold=0.6, + use_device=None): """ API for face detection. 
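Each module resolves which predictor to run with the same precedence, implemented in the next hunk: an explicit use_device selects its predictor directly and overrides the legacy use_gpu flag; otherwise use_gpu chooses between the gpu and cpu predictors built in _set_config(). The condensed sketch below is a hypothetical helper, not code from the patch (the patch spells the branches out as an if/elif chain), and "module" stands for any of the patched hub.Module instances:

def select_predictor(module, use_gpu=False, use_device=None):
    # use_device wins over use_gpu; an unknown device name, or one whose
    # predictor was never created because its environment variable was not
    # set, surfaces as an error here.
    if use_device is not None:
        predictor = getattr(module, use_device + "_predictor", None)
        if predictor is None:
            raise Exception("Unsupported device: " + str(use_device))
        return predictor
    return module.gpu_predictor if use_gpu else module.cpu_predictor

The --use_device option added to each module's config-argument group passes the same value through from hub run.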
@@ -96,18 +127,29 @@ class PyramidBoxLiteMobileMask(hub.Module): it reduce the prediction speed for the increase model calculation. shrink (float): parameter to control the resize scale in preprocess. confs_threshold (float): confidence threshold. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The result of face detection and save path of images. """ - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." - ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor # compatibility with older versions if data: @@ -122,7 +164,8 @@ class PyramidBoxLiteMobileMask(hub.Module): # get all data all_element = list() - for yield_data in reader(self.face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale): + for yield_data in reader(self.face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale, + use_device): all_element.append(yield_data) image_list = list() @@ -145,13 +188,18 @@ class PyramidBoxLiteMobileMask(hub.Module): except: pass - image_arr = np.squeeze(np.array(batch_data), axis=1) - image_tensor = PaddleTensor(image_arr.copy()) - data_out = self.gpu_predictor.run([image_tensor]) if use_gpu else self.cpu_predictor.run([image_tensor]) - # len(data_out) == 1 - # data_out[0].as_ndarray().shape == (-1, 2) - data_out = data_out[0].as_ndarray() - predict_out = np.concatenate((predict_out, data_out)) + batch_image = np.squeeze(np.array(batch_data), axis=1) + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + + predict_out = np.concatenate((predict_out, predictor_output)) predict_out = predict_out[1:] # postprocess one by one @@ -229,7 +277,8 @@ class PyramidBoxLiteMobileMask(hub.Module): output_dir=args.output_dir, visualization=args.visualization, shrink=args.shrink, - confs_threshold=args.confs_threshold) + confs_threshold=args.confs_threshold, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -242,6 +291,10 @@ class PyramidBoxLiteMobileMask(hub.Module): '--output_dir', type=str, default='detection_result', help="The directory to save output images.") self.arg_config_group.add_argument( '--visualization', type=ast.literal_eval, default=False, help="whether to save output as images.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. 
overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/image/face_detection/pyramidbox_lite_server/module.py b/modules/image/face_detection/pyramidbox_lite_server/module.py index 617baba6cce0ef6cc4861a95369fd1c80f9b8a47..8484ad009ed79d33c0c2a1a4a31636cd73532e7e 100644 --- a/modules/image/face_detection/pyramidbox_lite_server/module.py +++ b/modules/image/face_detection/pyramidbox_lite_server/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from pyramidbox_lite_server.data_feed import reader @@ -29,26 +32,53 @@ class PyramidBoxLiteServer(hub.Module): self._set_config() self.processor = self + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) - - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + self.cpu_predictor = create_predictor(cpu_config) + + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def face_detection(self, images=None, @@ -58,7 +88,8 @@ class PyramidBoxLiteServer(hub.Module): output_dir='detection_result', visualization=False, shrink=0.5, - confs_threshold=0.6): + confs_threshold=0.6, + use_device=None): """ API for face detection. @@ -70,18 +101,29 @@ class PyramidBoxLiteServer(hub.Module): visualization (bool): Whether to save image or not. shrink (float): parameter to control the resize scale in preprocess. confs_threshold (float): confidence threshold. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The result of face detection and save path of images. """ - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. 
If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." - ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor # compatibility with older versions if data: @@ -97,11 +139,19 @@ class PyramidBoxLiteServer(hub.Module): res = list() # process one by one for element in reader(images, paths, shrink): - image = np.expand_dims(element['image'], axis=0).astype('float32') - image_tensor = PaddleTensor(image.copy()) - data_out = self.gpu_predictor.run([image_tensor]) if use_gpu else self.cpu_predictor.run([image_tensor]) + batch_image = np.expand_dims(element['image'], axis=0).astype('float32') + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + out = postprocess( - data_out=data_out[0].as_ndarray(), + data_out=predictor_output, org_im=element['org_im'], org_im_path=element['org_im_path'], image_width=element['image_width'], @@ -163,7 +213,8 @@ class PyramidBoxLiteServer(hub.Module): output_dir=args.output_dir, visualization=args.visualization, shrink=args.shrink, - confs_threshold=args.confs_threshold) + confs_threshold=args.confs_threshold, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -176,6 +227,10 @@ class PyramidBoxLiteServer(hub.Module): '--output_dir', type=str, default='detection_result', help="The directory to save output images.") self.arg_config_group.add_argument( '--visualization', type=ast.literal_eval, default=False, help="whether to save output as images.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. 
overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/image/face_detection/pyramidbox_lite_server_mask/data_feed.py b/modules/image/face_detection/pyramidbox_lite_server_mask/data_feed.py index 7d13164821006c162e8f03102e30143313a25ca6..54021aced364c266002b89f49dcc513597169c33 100644 --- a/modules/image/face_detection/pyramidbox_lite_server_mask/data_feed.py +++ b/modules/image/face_detection/pyramidbox_lite_server_mask/data_feed.py @@ -43,8 +43,7 @@ def bbox_vote(det): det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4)) max_score = np.max(det_accu[:, 4]) det_accu_sum = np.zeros((1, 5)) - det_accu_sum[:, 0:4] = np.sum( - det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:]) + det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:]) det_accu_sum[:, 4] = max_score try: dets = np.row_stack((dets, det_accu_sum)) @@ -54,38 +53,26 @@ def bbox_vote(det): return dets -def crop(image, - pts, - shift=0, - scale=1.5, - rotate=0, - res_width=128, - res_height=128): +def crop(image, pts, shift=0, scale=1.5, rotate=0, res_width=128, res_height=128): res = (res_width, res_height) idx1 = 0 idx2 = 1 # angle alpha = 0 - if pts[idx2, 0] != -1 and pts[idx2, 1] != -1 and pts[idx1, 0] != -1 and pts[ - idx1, 1] != -1: - alpha = math.atan2(pts[idx2, 1] - pts[idx1, 1], - pts[idx2, 0] - pts[idx1, 0]) * 180 / math.pi + if pts[idx2, 0] != -1 and pts[idx2, 1] != -1 and pts[idx1, 0] != -1 and pts[idx1, 1] != -1: + alpha = math.atan2(pts[idx2, 1] - pts[idx1, 1], pts[idx2, 0] - pts[idx1, 0]) * 180 / math.pi pts[pts == -1] = np.inf coord_min = np.min(pts, 0) pts[pts == np.inf] = -1 coord_max = np.max(pts, 0) # coordinates of center point - c = np.array([ - coord_max[0] - (coord_max[0] - coord_min[0]) / 2, - coord_max[1] - (coord_max[1] - coord_min[1]) / 2 - ]) # center - max_wh = max((coord_max[0] - coord_min[0]) / 2, - (coord_max[1] - coord_min[1]) / 2) + c = np.array([coord_max[0] - (coord_max[0] - coord_min[0]) / 2, + coord_max[1] - (coord_max[1] - coord_min[1]) / 2]) # center + max_wh = max((coord_max[0] - coord_min[0]) / 2, (coord_max[1] - coord_min[1]) / 2) # Shift the center point, rot add eyes angle c = c + shift * max_wh rotate = rotate + alpha - M = cv2.getRotationMatrix2D((c[0], c[1]), rotate, - res[0] / (2 * max_wh * scale)) + M = cv2.getRotationMatrix2D((c[0], c[1]), rotate, res[0] / (2 * max_wh * scale)) M[0, 2] = M[0, 2] - (c[0] - res[0] / 2.0) M[1, 2] = M[1, 2] - (c[1] - res[0] / 2.0) image_out = cv2.warpAffine(image, M, res) @@ -97,27 +84,24 @@ def color_normalize(image, mean, std=None): image = np.repeat(image, axis=2) h, w, c = image.shape image = np.transpose(image, (2, 0, 1)) - image = np.subtract(image.reshape(c, -1), mean[:, np.newaxis]).reshape( - -1, h, w) + image = np.subtract(image.reshape(c, -1), mean[:, np.newaxis]).reshape(-1, h, w) image = np.transpose(image, (1, 2, 0)) return image def process_image(org_im, face): pts = np.array([ - face['left'], face['top'], face['right'], face['top'], face['left'], - face['bottom'], face['right'], face['bottom'] + face['left'], face['top'], face['right'], face['top'], face['left'], face['bottom'], face['right'], + face['bottom'] ]).reshape(4, 2).astype(np.float32) image_in, M = crop(org_im, pts) image_in = image_in / 256.0 image_in = color_normalize(image_in, mean=np.array([0.5, 0.5, 0.5])) - image_in = image_in.astype(np.float32).transpose([2, 0, 1]).reshape( - -1, 3, 128, 128) + image_in = image_in.astype(np.float32).transpose([2, 0, 1]).reshape(-1, 3, 128, 128) return image_in 
-def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, - use_multi_scale): +def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale, use_device=None): """ Preprocess to yield image. @@ -129,6 +113,7 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, paths (list[str]): paths to images. use_gpu (bool): whether to use gpu in face_detector. use_multi_scale (bool): whether to enable multi-scale face detection. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Yield: element (collections.OrderedDict): info of original image, preprocessed image, contains 3 keys: org_im (numpy.ndarray) : original image. @@ -142,8 +127,7 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, assert type(paths) is list, "paths should be a list." for im_path in paths: each = OrderedDict() - assert os.path.isfile( - im_path), "The {} isn't a valid file path.".format(im_path) + assert os.path.isfile(im_path), "The {} isn't a valid file path.".format(im_path) im = cv2.imread(im_path) each['org_im'] = im each['org_im_path'] = im_path @@ -153,8 +137,7 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, for im in images: each = OrderedDict() each['org_im'] = im - each['org_im_path'] = 'ndarray_time={}'.format( - round(time.time(), 6) * 1e6) + each['org_im_path'] = 'ndarray_time={}'.format(round(time.time(), 6) * 1e6) component.append(each) for element in component: @@ -167,31 +150,24 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_gpu=use_gpu, visualization=False, shrink=scale, - confs_threshold=confs_threshold) + confs_threshold=confs_threshold, + use_device=use_device) _s = list() for _face in _detect_res[0]['data']: - _face_list = [ - _face['left'], _face['top'], _face['right'], - _face['bottom'], _face['confidence'] - ] + _face_list = [_face['left'], _face['top'], _face['right'], _face['bottom'], _face['confidence']] _s.append(_face_list) if _s: scale_res.append(np.array(_s)) + if scale_res: - scale_res = np.row_stack(scale_res) - scale_res = bbox_vote(scale_res) - keep_index = np.where(scale_res[:, 4] >= confs_threshold)[0] - scale_res = scale_res[keep_index, :] - for data in scale_res: - face = { - 'left': data[0], - 'top': data[1], - 'right': data[2], - 'bottom': data[3], - 'confidence': data[4] - } + scale_res = np.row_stack(scale_res) + scale_res = bbox_vote(scale_res) + keep_index = np.where(scale_res[:, 4] >= confs_threshold)[0] + scale_res = scale_res[keep_index, :] + for data in scale_res: + face = {'left': data[0], 'top': data[1], 'right': data[2], 'bottom': data[3], 'confidence': data[4]} detect_faces.append(face) else: detect_faces = [] @@ -201,7 +177,8 @@ def reader(face_detector, shrink, confs_threshold, images, paths, use_gpu, use_gpu=use_gpu, visualization=False, shrink=shrink, - confs_threshold=confs_threshold) + confs_threshold=confs_threshold, + use_device=use_device) detect_faces = _detect_res[0]['data'] element['preprocessed'] = list() diff --git a/modules/image/face_detection/pyramidbox_lite_server_mask/module.py b/modules/image/face_detection/pyramidbox_lite_server_mask/module.py index e4f8eac50fd65c0531001b1cf3412ba2a73464f4..b9b30bdd0e0a4141fbf502505a672cd6d84cd275 100644 --- a/modules/image/face_detection/pyramidbox_lite_server_mask/module.py +++ b/modules/image/face_detection/pyramidbox_lite_server_mask/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as 
hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from pyramidbox_lite_server_mask.data_feed import reader @@ -30,8 +33,7 @@ class PyramidBoxLiteServerMask(hub.Module): Args: face_detector_module (class): module to detect face. """ - self.default_pretrained_model_path = os.path.join( - self.directory, "pyramidbox_lite_server_mask_model") + self.default_pretrained_model_path = os.path.join(self.directory, "pyramidbox_lite_server_mask_model") if face_detector_module is None: self.face_detector = hub.Module(name='pyramidbox_lite_server') else: @@ -39,27 +41,53 @@ class PyramidBoxLiteServerMask(hub.Module): self._set_config() self.processor = self + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) + self.cpu_predictor = create_predictor(cpu_config) - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu( - memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def set_face_detector_module(self, face_detector_module): """ @@ -82,7 +110,8 @@ class PyramidBoxLiteServerMask(hub.Module): output_dir='detection_result', use_multi_scale=False, shrink=0.5, - confs_threshold=0.6): + confs_threshold=0.6, + use_device=None): """ API for face detection. @@ -97,18 +126,29 @@ class PyramidBoxLiteServerMask(hub.Module): it reduce the prediction speed for the increase model calculation. shrink (float): parameter to control the resize scale in preprocess. confs_threshold (float): confidence threshold. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The result of face detection and save path of images. """ - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Attempt to use GPU for prediction, but environment variable CUDA_VISIBLE_DEVICES was not set correctly." 
- ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor # compatibility with older versions if data: @@ -123,16 +163,14 @@ class PyramidBoxLiteServerMask(hub.Module): # get all data all_element = list() - for yield_data in reader(self.face_detector, shrink, confs_threshold, - images, paths, use_gpu, use_multi_scale): + for yield_data in reader(self.face_detector, shrink, confs_threshold, images, paths, use_gpu, use_multi_scale, + use_device): all_element.append(yield_data) image_list = list() element_image_num = list() for i in range(len(all_element)): - element_image = [ - handled['image'] for handled in all_element[i]['preprocessed'] - ] + element_image = [handled['image'] for handled in all_element[i]['preprocessed']] element_image_num.append(len(element_image)) image_list.extend(element_image) @@ -149,23 +187,24 @@ class PyramidBoxLiteServerMask(hub.Module): except: pass - image_arr = np.squeeze(np.array(batch_data), axis=1) - image_tensor = PaddleTensor(image_arr.copy()) - data_out = self.gpu_predictor.run([ - image_tensor - ]) if use_gpu else self.cpu_predictor.run([image_tensor]) - # len(data_out) == 1 - # data_out[0].as_ndarray().shape == (-1, 2) - data_out = data_out[0].as_ndarray() - predict_out = np.concatenate((predict_out, data_out)) + batch_image = np.squeeze(np.array(batch_data), axis=1) + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + + predict_out = np.concatenate((predict_out, predictor_output)) predict_out = predict_out[1:] # postprocess one by one res = list() for i in range(len(all_element)): - detect_faces_list = [ - handled['face'] for handled in all_element[i]['preprocessed'] - ] + detect_faces_list = [handled['face'] for handled in all_element[i]['preprocessed']] interval_left = sum(element_image_num[0:i]) interval_right = interval_left + element_image_num[i] out = postprocess( @@ -178,31 +217,16 @@ class PyramidBoxLiteServerMask(hub.Module): res.append(out) return res - def save_inference_model(self, - dirname, - model_filename=None, - params_filename=None, - combined=True): + def save_inference_model(self, dirname, model_filename=None, params_filename=None, combined=True): classifier_dir = os.path.join(dirname, 'mask_detector') detector_dir = os.path.join(dirname, 'pyramidbox_lite') - self._save_classifier_model(classifier_dir, model_filename, - params_filename, combined) - self._save_detector_model(detector_dir, model_filename, params_filename, - combined) - - def _save_detector_model(self, - dirname, - model_filename=None, - params_filename=None, - combined=True): - self.face_detector.save_inference_model(dirname, model_filename, - params_filename, combined) - - def _save_classifier_model(self, - dirname, - model_filename=None, - params_filename=None, - combined=True): + 
self._save_classifier_model(classifier_dir, model_filename, params_filename, combined) + self._save_detector_model(detector_dir, model_filename, params_filename, combined) + + def _save_detector_model(self, dirname, model_filename=None, params_filename=None, combined=True): + self.face_detector.save_inference_model(dirname, model_filename, params_filename, combined) + + def _save_classifier_model(self, dirname, model_filename=None, params_filename=None, combined=True): if combined: model_filename = "__model__" if not model_filename else model_filename params_filename = "__params__" if not params_filename else params_filename @@ -240,12 +264,9 @@ class PyramidBoxLiteServerMask(hub.Module): prog='hub run {}'.format(self.name), usage='%(prog)s', add_help=True) - self.arg_input_group = self.parser.add_argument_group( - title="Input options", description="Input data. Required") + self.arg_input_group = self.parser.add_argument_group(title="Input options", description="Input data. Required") self.arg_config_group = self.parser.add_argument_group( - title="Config options", - description= - "Run configuration for controlling module behavior, not required.") + title="Config options", description="Run configuration for controlling module behavior, not required.") self.add_module_config_arg() self.add_module_input_arg() args = self.parser.parse_args(argvs) @@ -255,7 +276,8 @@ class PyramidBoxLiteServerMask(hub.Module): output_dir=args.output_dir, visualization=args.visualization, shrink=args.shrink, - confs_threshold=args.confs_threshold) + confs_threshold=args.confs_threshold, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -263,36 +285,25 @@ class PyramidBoxLiteServerMask(hub.Module): Add the command config options. """ self.arg_config_group.add_argument( - '--use_gpu', - type=ast.literal_eval, - default=False, - help="whether use GPU or not") + '--use_gpu', type=ast.literal_eval, default=False, help="whether use GPU or not") self.arg_config_group.add_argument( - '--output_dir', - type=str, - default='detection_result', - help="The directory to save output images.") + '--output_dir', type=str, default='detection_result', help="The directory to save output images.") self.arg_config_group.add_argument( - '--visualization', - type=ast.literal_eval, - default=False, - help="whether to save output as images.") + '--visualization', type=ast.literal_eval, default=False, help="whether to save output as images.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.") def add_module_input_arg(self): """ Add the command input options. """ - self.arg_input_group.add_argument( - '--input_path', type=str, help="path to image.") + self.arg_input_group.add_argument('--input_path', type=str, help="path to image.") self.arg_input_group.add_argument( '--shrink', type=ast.literal_eval, default=0.5, - help= - "resize the image to `shrink * original_shape` before feeding into network." - ) + help="resize the image to `shrink * original_shape` before feeding into network.") self.arg_input_group.add_argument( - '--confs_threshold', - type=ast.literal_eval, - default=0.6, - help="confidence threshold.") + '--confs_threshold', type=ast.literal_eval, default=0.6, help="confidence threshold.")
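
For reference, the sketch below condenses the inference pattern this patch applies identically in every module: probe the device environment variable, build a paddle.inference Config for each available device, create the predictor once, and exchange data through input/output handles instead of PaddleTensor. It is a minimal standalone illustration rather than code from the patch; the model path and input shape are placeholders, and only the CPU and GPU branches are shown (the NPU and XPU branches in the patch follow the same shape via enable_npu / enable_xpu).

import os

import numpy as np
from paddle.inference import Config, create_predictor

MODEL_PATH = "path/to/inference_model"  # placeholder, not a path from this repo


def get_device_id(env_var):
    # Mirrors the _get_device_id helper added in the patch: a parsable integer
    # in the environment variable selects the device, anything else means -1.
    try:
        return int(os.environ[env_var])
    except (KeyError, ValueError):
        return -1


def build_predictor():
    # GPU if CUDA_VISIBLE_DEVICES holds a single device id, otherwise CPU.
    gpu_id = get_device_id("CUDA_VISIBLE_DEVICES")
    config = Config(MODEL_PATH)
    config.disable_glog_info()
    if gpu_id != -1:
        config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id)
    else:
        config.disable_gpu()
    return create_predictor(config)


def run_batch(predictor, batch_image):
    # Handle-based I/O that replaces PaddleTensor: copy the batch in, run the
    # predictor, copy the first output back to the host.
    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.reshape(batch_image.shape)
    input_handle.copy_from_cpu(batch_image)
    predictor.run()
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return output_handle.copy_to_cpu()


if __name__ == "__main__":
    predictor = build_predictor()
    dummy_batch = np.random.rand(1, 3, 128, 128).astype("float32")  # placeholder shape
    print(run_batch(predictor, dummy_batch).shape)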