diff --git a/modules/image/classification/mobilenet_v2_animals/module.py b/modules/image/classification/mobilenet_v2_animals/module.py
index c691a41e90c60def9b42a8fc246e2a900e86ee01..b9d62aa7c5f9468deb00a407f75ccead0544e192 100644
--- a/modules/image/classification/mobilenet_v2_animals/module.py
+++ b/modules/image/classification/mobilenet_v2_animals/module.py
@@ -9,7 +9,10 @@ import os
 import numpy as np
 import paddle.fluid as fluid
 import paddlehub as hub
-from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
+
+from paddle.inference import Config
+from paddle.inference import create_predictor
+
 from paddlehub.module.module import moduleinfo, runnable, serving
 from paddlehub.common.paddle_helper import add_vars_prefix
 
@@ -48,26 +51,53 @@ class MobileNetV2Animals(hub.Module):
         im_std = np.array([0.229, 0.224, 0.225]).reshape(1, 3)
         return im_std
 
+    def _get_device_id(self, places):
+        try:
+            places = os.environ[places]
+            id = int(places)
+        except:
+            id = -1
+        return id
+
     def _set_config(self):
         """
         predictor config setting
         """
-        cpu_config = AnalysisConfig(self.default_pretrained_model_path)
+
+        # create default cpu predictor
+        cpu_config = Config(self.default_pretrained_model_path)
         cpu_config.disable_glog_info()
         cpu_config.disable_gpu()
-        self.cpu_predictor = create_paddle_predictor(cpu_config)
+        self.cpu_predictor = create_predictor(cpu_config)
 
-        try:
-            _places = os.environ["CUDA_VISIBLE_DEVICES"]
-            int(_places[0])
-            use_gpu = True
-        except:
-            use_gpu = False
-        if use_gpu:
-            gpu_config = AnalysisConfig(self.default_pretrained_model_path)
+        # create predictors using various types of devices
+
+        # npu
+        npu_id = self._get_device_id("FLAGS_selected_npus")
+        if npu_id != -1:
+            # use npu
+            npu_config = Config(self.default_pretrained_model_path)
+            npu_config.disable_glog_info()
+            npu_config.enable_npu(device_id=npu_id)
+            self.npu_predictor = create_predictor(npu_config)
+
+        # gpu
+        gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES")
+        if gpu_id != -1:
+            # use gpu
+            gpu_config = Config(self.default_pretrained_model_path)
             gpu_config.disable_glog_info()
-            gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)
-            self.gpu_predictor = create_paddle_predictor(gpu_config)
+            gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id)
+            self.gpu_predictor = create_predictor(gpu_config)
+
+        # xpu
+        xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES")
+        if xpu_id != -1:
+            # use xpu
+            xpu_config = Config(self.default_pretrained_model_path)
+            xpu_config.disable_glog_info()
+            xpu_config.enable_xpu(100)
+            self.xpu_predictor = create_predictor(xpu_config)
 
     def context(self, trainable=True, pretrained=True):
         """context for transfer learning.
@@ -117,7 +147,7 @@ class MobileNetV2Animals(hub.Module):
                 param.trainable = trainable
         return inputs, outputs, context_prog
 
-    def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1):
+    def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1, use_device=None):
         """
         API for image classification.
 
@@ -127,18 +157,29 @@ class MobileNetV2Animals(hub.Module):
             batch_size (int): batch size.
             use_gpu (bool): Whether to use gpu.
             top_k (int): Return top k results.
+            use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag.
 
         Returns:
             res (list[dict]): The classfication results.
         """
-        if use_gpu:
-            try:
-                _places = os.environ["CUDA_VISIBLE_DEVICES"]
-                int(_places[0])
-            except:
-                raise RuntimeError(
-                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
-                )
+        # real predictor to use
+        if use_device is not None:
+            if use_device == "cpu":
+                predictor = self.cpu_predictor
+            elif use_device == "xpu":
+                predictor = self.xpu_predictor
+            elif use_device == "npu":
+                predictor = self.npu_predictor
+            elif use_device == "gpu":
+                predictor = self.gpu_predictor
+            else:
+                raise Exception("Unsupported device: " + use_device)
+        else:
+            # use_device is not set, therefore follow use_gpu
+            if use_gpu:
+                predictor = self.gpu_predictor
+            else:
+                predictor = self.cpu_predictor
 
         all_data = list()
         for yield_data in reader(images, paths):
@@ -158,10 +199,16 @@ class MobileNetV2Animals(hub.Module):
                 pass
             # feed batch image
            batch_image = np.array([data['image'] for data in batch_data])
-            batch_image = PaddleTensor(batch_image.copy())
-            predictor_output = self.gpu_predictor.run([batch_image]) if use_gpu else self.cpu_predictor.run(
-                [batch_image])
-            out = postprocess(data_out=predictor_output[0].as_ndarray(), label_list=self.label_list, top_k=top_k)
+
+            input_names = predictor.get_input_names()
+            input_tensor = predictor.get_input_handle(input_names[0])
+            input_tensor.reshape(batch_image.shape)
+            input_tensor.copy_from_cpu(batch_image.copy())
+            predictor.run()
+            output_names = predictor.get_output_names()
+            output_handle = predictor.get_output_handle(output_names[0])
+            predictor_output = output_handle.copy_to_cpu()
+            out = postprocess(data_out=predictor_output, label_list=self.label_list, top_k=top_k)
             res += out
         return res
 
@@ -209,7 +256,12 @@ class MobileNetV2Animals(hub.Module):
         self.add_module_config_arg()
         self.add_module_input_arg()
         args = self.parser.parse_args(argvs)
-        results = self.classification(paths=[args.input_path], batch_size=args.batch_size, use_gpu=args.use_gpu)
+        results = self.classification(
+            paths=[args.input_path],
+            batch_size=args.batch_size,
+            use_gpu=args.use_gpu,
+            top_k=args.top_k,
+            use_device=args.use_device)
         return results
 
     def add_module_config_arg(self):
@@ -220,6 +272,10 @@ class MobileNetV2Animals(hub.Module):
             '--use_gpu', type=ast.literal_eval, default=False, help="whether use GPU or not.")
         self.arg_config_group.add_argument('--batch_size', type=ast.literal_eval, default=1, help="batch size.")
         self.arg_config_group.add_argument('--top_k', type=ast.literal_eval, default=1, help="Return top k results.")
+        self.arg_config_group.add_argument(
+            '--use_device',
+            choices=["cpu", "gpu", "xpu", "npu"],
+            help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.")
 
     def add_module_input_arg(self):
         """
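Usage sketch for the new use_device argument (an editor's illustration, not part of the patch; assumes the module is installed via PaddleHub and "cat.jpg" is a hypothetical input file):

    import paddlehub as hub

    classifier = hub.Module(name="mobilenet_v2_animals")

    # use_device overrides use_gpu and must name a predictor that was actually
    # created in _set_config, i.e. the matching environment variable
    # (CUDA_VISIBLE_DEVICES / XPU_VISIBLE_DEVICES / FLAGS_selected_npus)
    # held a single device id when the module was loaded.
    results = classifier.classification(paths=["cat.jpg"], top_k=3, use_device="gpu")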
overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/image/classification/resnet50_vd_dishes/module.py b/modules/image/classification/resnet50_vd_dishes/module.py index 43c072b7f81ab9e02749bcc12734c454b2da8a85..a276a344a65d049d0e57b6ed2bf756ed8c5f4f3b 100644 --- a/modules/image/classification/resnet50_vd_dishes/module.py +++ b/modules/image/classification/resnet50_vd_dishes/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from paddlehub.common.paddle_helper import add_vars_prefix @@ -47,26 +50,53 @@ class ResNet50vdDishes(hub.Module): im_std = np.array([0.229, 0.224, 0.225]).reshape(1, 3) return im_std + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) + self.cpu_predictor = create_predictor(cpu_config) - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def context(self, trainable=True, pretrained=True): """context for transfer learning. @@ -116,7 +146,7 @@ class ResNet50vdDishes(hub.Module): param.trainable = trainable return inputs, outputs, context_prog - def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1): + def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1, use_device=None): """ API for image classification. @@ -126,18 +156,29 @@ class ResNet50vdDishes(hub.Module): batch_size (int): batch size. use_gpu (bool): Whether to use gpu. top_k (int): Return top k results. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The classfication results. 
""" - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." - ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor all_data = list() for yield_data in reader(images, paths): @@ -157,10 +198,16 @@ class ResNet50vdDishes(hub.Module): pass # feed batch image batch_image = np.array([data['image'] for data in batch_data]) - batch_image = PaddleTensor(batch_image.copy()) - predictor_output = self.gpu_predictor.run([batch_image]) if use_gpu else self.cpu_predictor.run( - [batch_image]) - out = postprocess(data_out=predictor_output[0].as_ndarray(), label_list=self.label_list, top_k=top_k) + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + out = postprocess(data_out=predictor_output, label_list=self.label_list, top_k=top_k) res += out return res @@ -208,7 +255,12 @@ class ResNet50vdDishes(hub.Module): self.add_module_config_arg() self.add_module_input_arg() args = self.parser.parse_args(argvs) - results = self.classification(paths=[args.input_path], batch_size=args.batch_size, use_gpu=args.use_gpu) + results = self.classification( + paths=[args.input_path], + batch_size=args.batch_size, + use_gpu=args.use_gpu, + top_k=args.top_k, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -219,6 +271,10 @@ class ResNet50vdDishes(hub.Module): '--use_gpu', type=ast.literal_eval, default=False, help="whether use GPU or not.") self.arg_config_group.add_argument('--batch_size', type=ast.literal_eval, default=1, help="batch size.") self.arg_config_group.add_argument('--top_k', type=ast.literal_eval, default=1, help="Return top k results.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. 
overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/image/classification/resnet50_vd_wildanimals/module.py b/modules/image/classification/resnet50_vd_wildanimals/module.py index e3ab6e73b35da2c8ca6d955fd8a864a284018434..af408ac4fa78cda493bdb51163e1fa187d1135e7 100644 --- a/modules/image/classification/resnet50_vd_wildanimals/module.py +++ b/modules/image/classification/resnet50_vd_wildanimals/module.py @@ -9,7 +9,10 @@ import os import numpy as np import paddle.fluid as fluid import paddlehub as hub -from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor + +from paddle.inference import Config +from paddle.inference import create_predictor + from paddlehub.module.module import moduleinfo, runnable, serving from paddlehub.common.paddle_helper import add_vars_prefix @@ -48,26 +51,53 @@ class ResNet50vdWildAnimals(hub.Module): im_std = np.array([0.229, 0.224, 0.225]).reshape(1, 3) return im_std + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): """ predictor config setting. """ - cpu_config = AnalysisConfig(self.default_pretrained_model_path) + + # create default cpu predictor + cpu_config = Config(self.default_pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = create_paddle_predictor(cpu_config) + self.cpu_predictor = create_predictor(cpu_config) - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = AnalysisConfig(self.default_pretrained_model_path) + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.default_pretrained_model_path) + npu_config.disable_glog_info() + npu_config.enable_npu(device_id=npu_id) + self.npu_predictor = create_predictor(npu_config) + + # gpu + gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES") + if gpu_id != -1: + # use gpu + gpu_config = Config(self.default_pretrained_model_path) gpu_config.disable_glog_info() - gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) - self.gpu_predictor = create_paddle_predictor(gpu_config) + gpu_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=gpu_id) + self.gpu_predictor = create_predictor(gpu_config) + + # xpu + xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES") + if xpu_id != -1: + # use xpu + xpu_config = Config(self.default_pretrained_model_path) + xpu_config.disable_glog_info() + xpu_config.enable_xpu(100) + self.xpu_predictor = create_predictor(xpu_config) def context(self, trainable=True, pretrained=True): """context for transfer learning. @@ -117,7 +147,7 @@ class ResNet50vdWildAnimals(hub.Module): param.trainable = trainable return inputs, outputs, context_prog - def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1): + def classification(self, images=None, paths=None, batch_size=1, use_gpu=False, top_k=1, use_device=None): """ API for image classification. @@ -127,18 +157,29 @@ class ResNet50vdWildAnimals(hub.Module): batch_size (int): batch size. use_gpu (bool): Whether to use gpu. top_k (int): Return top k results. + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. Returns: res (list[dict]): The classfication results. 
""" - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." - ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor all_data = list() for yield_data in reader(images, paths): @@ -158,10 +199,16 @@ class ResNet50vdWildAnimals(hub.Module): pass # feed batch image batch_image = np.array([data['image'] for data in batch_data]) - batch_image = PaddleTensor(batch_image.copy()) - predictor_output = self.gpu_predictor.run([batch_image]) if use_gpu else self.cpu_predictor.run( - [batch_image]) - out = postprocess(data_out=predictor_output[0].as_ndarray(), label_list=self.label_list, top_k=top_k) + + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + input_tensor.reshape(batch_image.shape) + input_tensor.copy_from_cpu(batch_image.copy()) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + predictor_output = output_handle.copy_to_cpu() + out = postprocess(data_out=predictor_output, label_list=self.label_list, top_k=top_k) res += out return res @@ -209,7 +256,12 @@ class ResNet50vdWildAnimals(hub.Module): self.add_module_config_arg() self.add_module_input_arg() args = self.parser.parse_args(argvs) - results = self.classification(paths=[args.input_path], batch_size=args.batch_size, use_gpu=args.use_gpu) + results = self.classification( + paths=[args.input_path], + batch_size=args.batch_size, + use_gpu=args.use_gpu, + top_k=args.top_k, + use_device=args.use_device) return results def add_module_config_arg(self): @@ -220,6 +272,10 @@ class ResNet50vdWildAnimals(hub.Module): '--use_gpu', type=ast.literal_eval, default=False, help="whether use GPU or not.") self.arg_config_group.add_argument('--batch_size', type=ast.literal_eval, default=1, help="batch size.") self.arg_config_group.add_argument('--top_k', type=ast.literal_eval, default=1, help="Return top k results.") + self.arg_config_group.add_argument( + '--use_device', + choices=["cpu", "gpu", "xpu", "npu"], + help="use cpu, gpu, xpu or npu. 
overwrites use_gpu flag.") def add_module_input_arg(self): """ diff --git a/modules/text/sentiment_analysis/emotion_detection_textcnn/module.py b/modules/text/sentiment_analysis/emotion_detection_textcnn/module.py index bfe7d54f85963f6b054e3c89e564a8fdee511969..c125f39c72b9099e7ef2bafde117c0b4ee3463af 100644 --- a/modules/text/sentiment_analysis/emotion_detection_textcnn/module.py +++ b/modules/text/sentiment_analysis/emotion_detection_textcnn/module.py @@ -151,7 +151,7 @@ class EmotionDetectionTextCNN(hub.NLPPredictionModule): return inputs, outputs, main_program @serving - def emotion_classify(self, texts=[], data={}, use_gpu=False, batch_size=1): + def emotion_classify(self, texts=[], data={}, use_gpu=False, batch_size=1, use_device=None): """ Get the emotion prediction results results with the texts as input Args: @@ -161,15 +161,26 @@ class EmotionDetectionTextCNN(hub.NLPPredictionModule): batch_size(int): the program deals once with one batch Returns: results(list): the emotion prediction results + use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag. """ - if use_gpu: - try: - _places = os.environ["CUDA_VISIBLE_DEVICES"] - int(_places[0]) - except: - raise RuntimeError( - "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." - ) + # real predictor to use + if use_device is not None: + if use_device == "cpu": + predictor = self.cpu_predictor + elif use_device == "xpu": + predictor = self.xpu_predictor + elif use_device == "npu": + predictor = self.npu_predictor + elif use_device == "gpu": + predictor = self.gpu_predictor + else: + raise Exception("Unsupported device: " + use_device) + else: + # use_device is not set, therefore follow use_gpu + if use_gpu: + predictor = self.gpu_predictor + else: + predictor = self.cpu_predictor if texts != [] and isinstance(texts, list) and data == {}: predicted_data = texts @@ -189,14 +200,10 @@ class EmotionDetectionTextCNN(hub.NLPPredictionModule): else: batch_data = predicted_data[start_idx:] start_idx = start_idx + batch_size - processed_results = preprocess(self.word_seg_module, batch_data, self.vocab, use_gpu, batch_size) - tensor_words = self.texts2tensor(processed_results) - - if use_gpu: - batch_out = self.gpu_predictor.run([tensor_words]) - else: - batch_out = self.cpu_predictor.run([tensor_words]) - batch_result = postprocess(batch_out[0], processed_results) + processed_results = preprocess(self.word_seg_module, batch_data, self.vocab, use_gpu, batch_size, + use_device) + predictor_output = self._internal_predict(predictor, processed_results) + batch_result = postprocess(predictor_output, processed_results) results += batch_result return results diff --git a/modules/text/sentiment_analysis/emotion_detection_textcnn/processor.py b/modules/text/sentiment_analysis/emotion_detection_textcnn/processor.py index 10bd655af5cac809b51b2e10ad8325f9272a90e5..9c0f777fe636677d5d8636018207794dbcb22371 100644 --- a/modules/text/sentiment_analysis/emotion_detection_textcnn/processor.py +++ b/modules/text/sentiment_analysis/emotion_detection_textcnn/processor.py @@ -34,10 +34,10 @@ def get_predict_label(probs): return label, key -def preprocess(lac, predicted_data, word_dict, use_gpu=False, batch_size=1): +def preprocess(lac, predicted_data, word_dict, use_gpu=False, batch_size=1, use_device=None): result = [] data_dict = {"text": predicted_data} - processed = lac.lexical_analysis(data=data_dict, use_gpu=use_gpu, batch_size=batch_size) + processed = 
diff --git a/modules/text/sentiment_analysis/senta_bilstm/module.py b/modules/text/sentiment_analysis/senta_bilstm/module.py
index 0ee5ca73e0e2e2134802096c856870a276f0a6e3..42d80f18a8367f36009759d8281cd52ab6325b3e 100644
--- a/modules/text/sentiment_analysis/senta_bilstm/module.py
+++ b/modules/text/sentiment_analysis/senta_bilstm/module.py
@@ -153,7 +153,7 @@ class SentaBiLSTM(hub.NLPPredictionModule):
         return inputs, outputs, main_program
 
     @serving
-    def sentiment_classify(self, texts=[], data={}, use_gpu=False, batch_size=1):
+    def sentiment_classify(self, texts=[], data={}, use_gpu=False, batch_size=1, use_device=None):
         """
         Get the sentiment prediction results results with the texts as input
 
@@ -162,18 +162,29 @@ class SentaBiLSTM(hub.NLPPredictionModule):
             data(dict): key must be 'text', value is the texts to be predicted, if data not texts
             use_gpu(bool): whether use gpu to predict or not
             batch_size(int): the program deals once with one batch
+            use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag.
 
         Returns:
             results(list): the word segmentation results
         """
-        if use_gpu:
-            try:
-                _places = os.environ["CUDA_VISIBLE_DEVICES"]
-                int(_places[0])
-            except:
-                raise RuntimeError(
-                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
-                )
+        # real predictor to use
+        if use_device is not None:
+            if use_device == "cpu":
+                predictor = self.cpu_predictor
+            elif use_device == "xpu":
+                predictor = self.xpu_predictor
+            elif use_device == "npu":
+                predictor = self.npu_predictor
+            elif use_device == "gpu":
+                predictor = self.gpu_predictor
+            else:
+                raise Exception("Unsupported device: " + use_device)
+        else:
+            # use_device is not set, therefore follow use_gpu
+            if use_gpu:
+                predictor = self.gpu_predictor
+            else:
+                predictor = self.cpu_predictor
 
         if texts != [] and isinstance(texts, list) and data == {}:
             predicted_data = texts
@@ -193,14 +204,10 @@ class SentaBiLSTM(hub.NLPPredictionModule):
                 batch_data = predicted_data[start_idx:]
                 start_idx = start_idx + batch_size
 
-            processed_results = preprocess(self.word_seg_module, batch_data, self.word_dict, use_gpu, batch_size)
-            tensor_words = self.texts2tensor(processed_results)
-
-            if use_gpu:
-                batch_out = self.gpu_predictor.run([tensor_words])
-            else:
-                batch_out = self.cpu_predictor.run([tensor_words])
-            batch_result = postprocess(batch_out[0], processed_results)
+            processed_results = preprocess(self.word_seg_module, batch_data, self.word_dict, use_gpu, batch_size,
+                                           use_device)
+            predictor_output = self._internal_predict(predictor, processed_results)
+            batch_result = postprocess(predictor_output, processed_results)
             results += batch_result
         return results
 
diff --git a/modules/text/sentiment_analysis/senta_bilstm/processor.py b/modules/text/sentiment_analysis/senta_bilstm/processor.py
index 39190cf3a7c02a5e7974f32329a584f40db81832..f181d0e08b14b7b63eb60f4d9ef01aa9001c0b99 100644
--- a/modules/text/sentiment_analysis/senta_bilstm/processor.py
+++ b/modules/text/sentiment_analysis/senta_bilstm/processor.py
@@ -17,14 +17,14 @@ def load_vocab(file_path):
     return vocab
 
 
-def preprocess(lac, texts, word_dict, use_gpu=False, batch_size=1):
+def preprocess(lac, texts, word_dict, use_gpu=False, batch_size=1, use_device=None):
     """
     firstly, the predicted texts are segmented by lac module
     then, the word segmention results input into senta
     """
     result = []
     input_dict = {'text': texts}
-    processed = lac.lexical_analysis(data=input_dict, use_gpu=use_gpu, batch_size=batch_size)
+    processed = lac.lexical_analysis(data=input_dict, use_gpu=use_gpu, batch_size=batch_size, use_device=use_device)
     unk_id = word_dict["<unk>"]
     for index, data in enumerate(processed):
         result_i = {'processed': []}
@@ -43,7 +43,7 @@ def postprocess(predict_out, texts):
     """
     Convert model's output tensor to sentiment label
     """
-    predict_out = predict_out.as_ndarray()
+    predict_out = predict_out.copy_to_cpu()
     batch_size = len(texts)
     result = []
     for index in range(batch_size):
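After this change postprocess() receives the predictor's output handle returned by _internal_predict, not a PaddleTensor, so it calls copy_to_cpu() instead of as_ndarray(). A sketch of the new contract using a stand-in handle (FakeOutputHandle is invented for illustration):

    import numpy as np

    class FakeOutputHandle:
        # stands in for the paddle.inference output handle given to postprocess
        def __init__(self, array):
            self._array = array

        def copy_to_cpu(self):
            return self._array

    handle = FakeOutputHandle(np.array([[0.2, 0.8]], dtype="float32"))
    probs = handle.copy_to_cpu()   # what the new postprocess does first
    print(probs.argmax(axis=1))    # -> [1], index of the predicted label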
diff --git a/modules/text/text_review/porn_detection_lstm/module.py b/modules/text/text_review/porn_detection_lstm/module.py
index e1b7778a5529a91b0531589b954241e92fc2f041..d6c2e2af47ae456a13b108b0cf9222cd66c19544 100644
--- a/modules/text/text_review/porn_detection_lstm/module.py
+++ b/modules/text/text_review/porn_detection_lstm/module.py
@@ -78,7 +78,7 @@ class PornDetectionLSTM(hub.NLPPredictionModule):
         return inputs, outputs, program
 
     @serving
-    def detection(self, texts=[], data={}, use_gpu=False, batch_size=1):
+    def detection(self, texts=[], data={}, use_gpu=False, batch_size=1, use_device=None):
         """
         Get the porn prediction results results with the texts as input
 
@@ -87,15 +87,29 @@ class PornDetectionLSTM(hub.NLPPredictionModule):
             texts(list): the input texts to be predicted, if texts not data
             data(dict): key must be 'text', value is the texts to be predicted, if data not texts
             use_gpu(bool): whether use gpu to predict or not
             batch_size(int): the program deals once with one batch
+            use_device (str): use cpu, gpu, xpu or npu, overwrites use_gpu flag.
         Returns:
             results(list): the porn prediction results
         """
-        try:
-            _places = os.environ["CUDA_VISIBLE_DEVICES"]
-            int(_places[0])
-        except:
-            use_gpu = False
+        # real predictor to use
+        if use_device is not None:
+            if use_device == "cpu":
+                predictor = self.cpu_predictor
+            elif use_device == "xpu":
+                predictor = self.xpu_predictor
+            elif use_device == "npu":
+                predictor = self.npu_predictor
+            elif use_device == "gpu":
+                predictor = self.gpu_predictor
+            else:
+                raise Exception("Unsupported device: " + use_device)
+        else:
+            # use_device is not set, therefore follow use_gpu
+            if use_gpu:
+                predictor = self.gpu_predictor
+            else:
+                predictor = self.cpu_predictor
 
         if texts != [] and isinstance(texts, list) and data == {}:
             predicted_data = texts
@@ -116,13 +130,8 @@ class PornDetectionLSTM(hub.NLPPredictionModule):
                 start_idx = start_idx + batch_size
 
             processed_results = preprocess(batch_data, self.tokenizer, self.vocab, self.sequence_max_len)
-            tensor_words = self.texts2tensor(processed_results)
-
-            if use_gpu:
-                batch_out = self.gpu_predictor.run([tensor_words])
-            else:
-                batch_out = self.cpu_predictor.run([tensor_words])
-            batch_result = postprocess(batch_out[0], processed_results)
+            predictor_output = self._internal_predict(predictor, processed_results)
+            batch_result = postprocess(predictor_output, processed_results)
             results += batch_result
         return results
 
diff --git a/modules/text/text_review/porn_detection_lstm/processor.py b/modules/text/text_review/porn_detection_lstm/processor.py
index 1f6c8b565f53708f27735e51d6631015095f2cf6..c691dd6b38f4760d74ae9ad6b32d962f2589fd1c 100644
--- a/modules/text/text_review/porn_detection_lstm/processor.py
+++ b/modules/text/text_review/porn_detection_lstm/processor.py
@@ -52,7 +52,7 @@ def postprocess(predict_out, texts):
     Convert model's output tensor to pornography label
     """
     result = []
-    predict_out = predict_out.as_ndarray()
+    predict_out = predict_out.copy_to_cpu()
     for index in range(len(texts)):
         result_i = {}
         result_i['text'] = texts[index]['origin']
diff --git a/paddlehub/commands/run.py b/paddlehub/commands/run.py
index d806ce167f38dd9b5e21eb512be4b5ceed81060b..5a49ff117b5d9f248d8dc8a0dc48cfbd10ef4db4 100644
--- a/paddlehub/commands/run.py
+++ b/paddlehub/commands/run.py
@@ -68,6 +68,10 @@ class RunCommand:
         arg_config_group.add_argument(
             '--use_gpu', type=ast.literal_eval, default=False, help='whether use GPU for prediction')
         arg_config_group.add_argument('--batch_size', type=int, default=1, help='batch size for prediction')
+        arg_config_group.add_argument(
+            '--use_device',
+            choices=["cpu", "gpu", "xpu", "npu"],
+            help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.")
 
         module_type = module.type.lower()
         if module_type.startswith('cv'):
@@ -83,4 +87,8 @@ class RunCommand:
         input_data = {key: [args.input_path] if module_type.startswith('cv') else [args.input_text]}
 
         return module(
-            sign_name=module.default_signature, data=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size)
+            sign_name=module.default_signature,
+            data=input_data,
+            use_gpu=args.use_gpu,
+            batch_size=args.batch_size,
+            use_device=args.use_device)
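The flag also surfaces in the CLI, e.g. (an assumed invocation) hub run mobilenet_v2_animals --input_path cat.jpg --use_device npu. A standalone argparse sketch of the same pattern:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--use_device',
        choices=["cpu", "gpu", "xpu", "npu"],
        help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.")

    args = parser.parse_args(["--use_device", "xpu"])
    assert args.use_device == "xpu"
    # when the flag is omitted, args.use_device is None and use_gpu decides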
overwrites use_gpu flag.") module_type = module.type.lower() if module_type.startswith('cv'): @@ -83,4 +87,8 @@ class RunCommand: input_data = {key: [args.input_path] if module_type.startswith('cv') else [args.input_text]} return module( - sign_name=module.default_signature, data=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size) + sign_name=module.default_signature, + data=input_data, + use_gpu=args.use_gpu, + batch_size=args.batch_size, + use_device=args.use_device) diff --git a/paddlehub/compat/module/module_v1.py b/paddlehub/compat/module/module_v1.py index 2e9b72b92c307d2132036686905cff2e53443d18..99e2c12b0d5592dc605311a6a38a963611b54607 100644 --- a/paddlehub/compat/module/module_v1.py +++ b/paddlehub/compat/module/module_v1.py @@ -167,7 +167,13 @@ class ModuleV1(object): program.global_block().var(feed_dict[tensor_name].name).desc.set_shape(seq_tensor_shape) @paddle_utils.run_in_static_mode - def __call__(self, sign_name: str, data: dict, use_gpu: bool = False, batch_size: int = 1, **kwargs): + def __call__(self, + sign_name: str, + data: dict, + use_gpu: bool = False, + batch_size: int = 1, + use_device: str = None, + **kwargs): '''Call the specified signature function for prediction.''' def _get_reader_and_feeder(data_format, data, place): @@ -188,7 +194,18 @@ class ModuleV1(object): with paddle.static.program_guard(program): result = [] index = 0 - place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + + if use_device is not None: + if use_device == "xpu": + place = paddle.XPUPlace(0) + elif use_device == "npu": + place = paddle.NPUPlace(0) + elif use_device == "gpu": + place = paddle.CUDAPlace(0) + else: + place = paddle.CPUPlace() + else: + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() exe = paddle.static.Executor(place=place) data = self.processor.preprocess(sign_name=sign_name, data_dict=data) diff --git a/paddlehub/compat/module/nlp_module.py b/paddlehub/compat/module/nlp_module.py index d7209774fd94ff2e6592b6aba9f9a18d81750933..80c61efa0fdde97822e46b061de12e7627220e96 100644 --- a/paddlehub/compat/module/nlp_module.py +++ b/paddlehub/compat/module/nlp_module.py @@ -31,6 +31,9 @@ from paddlehub.module.module import runnable, RunModule from paddlehub.utils.parser import txt_parser from paddlehub.utils.utils import sys_stdin_encoding +from paddle.inference import Config +from paddle.inference import create_predictor + class DataFormatError(Exception): def __init__(self, *args): @@ -48,24 +51,53 @@ class NLPBaseModule(RunModule): class NLPPredictionModule(NLPBaseModule): + def _get_device_id(self, places): + try: + places = os.environ[places] + id = int(places) + except: + id = -1 + return id + def _set_config(self): - '''predictor config setting''' - cpu_config = paddle.fluid.core.AnalysisConfig(self.pretrained_model_path) + """ + predictor config setting + """ + + # create default cpu predictor + cpu_config = Config(self.pretrained_model_path) cpu_config.disable_glog_info() cpu_config.disable_gpu() - self.cpu_predictor = paddle.fluid.core.create_paddle_predictor(cpu_config) - - try: - _places = os.environ['CUDA_VISIBLE_DEVICES'] - int(_places[0]) - use_gpu = True - except: - use_gpu = False - if use_gpu: - gpu_config = paddle.fluid.core.AnalysisConfig(self.pretrained_model_path) + self.cpu_predictor = create_predictor(cpu_config) + + # create predictors using various types of devices + + # npu + npu_id = self._get_device_id("FLAGS_selected_npus") + if npu_id != -1: + # use npu + npu_config = Config(self.pretrained_model_path) + 
diff --git a/paddlehub/compat/module/nlp_module.py b/paddlehub/compat/module/nlp_module.py
index d7209774fd94ff2e6592b6aba9f9a18d81750933..80c61efa0fdde97822e46b061de12e7627220e96 100644
--- a/paddlehub/compat/module/nlp_module.py
+++ b/paddlehub/compat/module/nlp_module.py
@@ -31,6 +31,9 @@ from paddlehub.module.module import runnable, RunModule
 from paddlehub.utils.parser import txt_parser
 from paddlehub.utils.utils import sys_stdin_encoding
 
+from paddle.inference import Config
+from paddle.inference import create_predictor
+
 
 class DataFormatError(Exception):
     def __init__(self, *args):
@@ -48,24 +51,53 @@ class NLPBaseModule(RunModule):
 
 
 class NLPPredictionModule(NLPBaseModule):
+    def _get_device_id(self, places):
+        try:
+            places = os.environ[places]
+            id = int(places)
+        except:
+            id = -1
+        return id
+
     def _set_config(self):
-        '''predictor config setting'''
-        cpu_config = paddle.fluid.core.AnalysisConfig(self.pretrained_model_path)
+        """
+        predictor config setting
+        """
+
+        # create default cpu predictor
+        cpu_config = Config(self.pretrained_model_path)
         cpu_config.disable_glog_info()
         cpu_config.disable_gpu()
-        self.cpu_predictor = paddle.fluid.core.create_paddle_predictor(cpu_config)
-
-        try:
-            _places = os.environ['CUDA_VISIBLE_DEVICES']
-            int(_places[0])
-            use_gpu = True
-        except:
-            use_gpu = False
-        if use_gpu:
-            gpu_config = paddle.fluid.core.AnalysisConfig(self.pretrained_model_path)
+        self.cpu_predictor = create_predictor(cpu_config)
+
+        # create predictors using various types of devices
+
+        # npu
+        npu_id = self._get_device_id("FLAGS_selected_npus")
+        if npu_id != -1:
+            # use npu
+            npu_config = Config(self.pretrained_model_path)
+            npu_config.disable_glog_info()
+            npu_config.enable_npu(device_id=npu_id)
+            self.npu_predictor = create_predictor(npu_config)
+
+        # gpu
+        gpu_id = self._get_device_id("CUDA_VISIBLE_DEVICES")
+        if gpu_id != -1:
+            # use gpu
+            gpu_config = Config(self.pretrained_model_path)
             gpu_config.disable_glog_info()
-            gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
-            self.gpu_predictor = paddle.fluid.core.create_paddle_predictor(gpu_config)
+            gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=gpu_id)
+            self.gpu_predictor = create_predictor(gpu_config)
+
+        # xpu
+        xpu_id = self._get_device_id("XPU_VISIBLE_DEVICES")
+        if xpu_id != -1:
+            # use xpu
+            xpu_config = Config(self.pretrained_model_path)
+            xpu_config.disable_glog_info()
+            xpu_config.enable_xpu(100)
+            self.xpu_predictor = create_predictor(xpu_config)
 
     def texts2tensor(self, texts: List[dict]) -> paddle.Tensor:
         '''
@@ -87,6 +119,29 @@ class NLPPredictionModule(NLPBaseModule):
             tensor.shape = [lod[-1], 1]
         return tensor
 
+    def _internal_predict(self, predictor, texts):
+        lod = [0]
+        data = []
+        for i, text in enumerate(texts):
+            data += text['processed']
+            lod.append(len(text['processed']) + lod[i])
+
+        # get predictor tensor
+        input_names = predictor.get_input_names()
+        input_tensor = predictor.get_input_handle(input_names[0])
+
+        # set data, shape and lod
+        input_tensor.copy_from_cpu(np.array(data).astype('int64'))
+        input_tensor.reshape([lod[-1], 1])
+        input_tensor.set_lod([lod])
+
+        # real predict
+        predictor.run()
+        output_names = predictor.get_output_names()
+        output_handle = predictor.get_output_handle(output_names[0])
+
+        return output_handle
+
     def to_unicode(self, texts: str) -> Text:
         '''
         Convert each element's type(str) of texts(list) to unicode in python2.7
@@ -129,7 +184,8 @@ class NLPPredictionModule(NLPBaseModule):
             self.parser.print_help()
             return None
 
-        results = self.predict(texts=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size)
+        results = self.predict(
+            texts=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size, use_device=args.use_device)
 
         return results
 
@@ -139,6 +195,10 @@ class NLPPredictionModule(NLPBaseModule):
         self.arg_config_group.add_argument(
             '--use_gpu', type=ast.literal_eval, default=False, help='whether use GPU for prediction')
         self.arg_config_group.add_argument('--batch_size', type=int, default=1, help='batch size for prediction')
+        self.arg_config_group.add_argument(
+            '--use_device',
+            choices=["cpu", "gpu", "xpu", "npu"],
+            help="use cpu, gpu, xpu or npu. overwrites use_gpu flag.")
 
     def add_module_input_arg(self):
         '''Add the command input options'''
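The new _internal_predict flattens a batch of variable-length texts into one LoD tensor. A worked sketch of how the lod offsets are built (the token ids are made up):

    # texts as produced by preprocess(): each entry carries its token ids
    texts = [
        {'processed': [3, 8, 2]},   # 3 tokens
        {'processed': [5, 9]},      # 2 tokens
    ]

    lod = [0]
    data = []
    for i, text in enumerate(texts):
        data += text['processed']
        lod.append(len(text['processed']) + lod[i])

    assert data == [3, 8, 2, 5, 9]
    assert lod == [0, 3, 5]
    # the input handle is then reshaped to [lod[-1], 1] == [5, 1] and given
    # set_lod([lod]) so the model can split the flat batch back into texts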