diff --git a/PaddleNLP/pretrain_language_models/BERT/config/xpu_bert_config.yaml b/PaddleNLP/pretrain_language_models/BERT/config/xpu_bert_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0271695ad9c63092b91e9949247e8a4aad7568e
--- /dev/null
+++ b/PaddleNLP/pretrain_language_models/BERT/config/xpu_bert_config.yaml
@@ -0,0 +1,35 @@
+# task_name for train.
+task_name: "XNLI"
+#use cuda for train
+use_cuda: false
+#use xpu for train
+use_xpu: true
+# do train
+do_train: true
+#do val
+do_val: true
+#do test
+do_test: true
+#batch size
+batch_size: 16
+#in_tokens
+in_tokens: false
+# init pretraining params for train
+init_pretraining_params: 'chinese_L-12_H-768_A-12/params'
+#xpu use data XNLI1.0
+data_dir: 'data/XNLI1.0/'
+#vocab_path
+vocab_path: 'chinese_L-12_H-768_A-12/vocab.txt'
+#checkpoints
+checkpoints: './save/checkpoints'
+save_steps: 100
+weight_decay: 0.01
+warmup_proportion: 0.1
+validation_steps: 100
+epoch: 1
+max_seq_len: 128
+learning_rate: 5e-5
+skip_steps: 10
+num_iteration_per_drop_scope: 10
+verbose: true
+bert_config_path: 'chinese_L-12_H-768_A-12/bert_config.json'
diff --git a/PaddleNLP/pretrain_language_models/BERT/run_classifier.py b/PaddleNLP/pretrain_language_models/BERT/run_classifier.py
index 4204307985c490df171bc41ddc580968036742df..2d1915b5653b9eb7ca09c9c192da928483e3a4a9 100644
--- a/PaddleNLP/pretrain_language_models/BERT/run_classifier.py
+++ b/PaddleNLP/pretrain_language_models/BERT/run_classifier.py
@@ -38,7 +38,7 @@ import reader.cls as reader
 from model.bert import BertConfig
 from model.classifier import create_model
 from optimization import optimization
-from utils.args import ArgumentGroup, print_arguments, check_cuda, check_version
+from utils.args import ArgumentGroup, print_arguments, check_cuda, check_xpu, check_version
 from utils.init import init_pretraining_params, init_checkpoint
 from utils.cards import get_cards
 import dist_utils
@@ -101,6 +101,7 @@ run_type_g.add_arg("is_profiler", int,
                    0, "the profiler switch. (used for benchmark)")
 run_type_g.add_arg("max_iter", int, 0, "the max batch nums to train. (used for benchmark)")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("use_xpu", bool, True, "If set, use XPU for training.")
 run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
 run_type_g.add_arg("shuffle", bool, True, "")
 run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
@@ -148,10 +149,17 @@ def get_device_num():
 def main(args):
     bert_config = BertConfig(args.bert_config_path)
     bert_config.print_config()
+
+    if args.use_xpu:
+        paddle.enable_static()  # NOTE(review): requires a module-level `import paddle` -- confirm it exists in this file
 
     if args.use_cuda:
         place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
         dev_count = get_device_num()
+    elif args.use_xpu:
+        xpu_id = int(os.getenv('FLAGS_selected_xpus', '0'))
+        place = fluid.XPUPlace(xpu_id)
+        dev_count = 1  # one XPU place per process (was len([place]), which is always 1)
     else:
         place = fluid.CPUPlace()
         dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -311,8 +319,12 @@
         train_data_generator = fluid.contrib.reader.distributed_batch_reader(
             train_data_generator)
 
-        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
+        if args.use_xpu:
+            # XPU path executes the plain Program; with_data_parallel is skipped here.
+            train_compiled_program = train_program
+        else:
+            train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
 
         train_data_loader.set_batch_generator(train_data_generator, place)
 
@@ -447,5 +459,6 @@
 if __name__ == '__main__':
     print_arguments(args)
     check_cuda(args.use_cuda)
+    check_xpu(args.use_xpu)
     check_version()
     main(args)
diff --git a/PaddleNLP/pretrain_language_models/BERT/utils/args.py b/PaddleNLP/pretrain_language_models/BERT/utils/args.py
index fb060d46a9cef41a7a28b5169d330f1eac2038b8..4f9a7bd6baf51a9235271b3b34643a22f3f94a1b 100644
--- a/PaddleNLP/pretrain_language_models/BERT/utils/args.py
+++ b/PaddleNLP/pretrain_language_models/BERT/utils/args.py
@@ -61,6 +61,16 @@ def check_cuda(use_cuda, err = \
     except Exception as e:
         pass
 
+def check_xpu(use_xpu, err = \
+    "\nYou can not set use_xpu = True in the model because you are using paddlepaddle-cpu or paddlepaddle-gpu.\n \
+     Please: 1. Install paddlepaddle-xpu to run your models on XPU or 2. Set use_xpu = False to run models on CPU.\n"
+    ):
+    try:
+        if use_xpu == True and fluid.is_compiled_with_xpu() == False:
+            print(err)
+            sys.exit(1)
+    except Exception as e:
+        pass
 
 def check_version():
     """