diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..845b13fd80896e376ae7c839c735b1f4b73be05f
--- /dev/null
+++ b/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml
@@ -0,0 +1,44 @@
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+dataset_type: Ernie
+train_path: data/iwslt2012_zh/train.txt
+dev_path: data/iwslt2012_zh/dev.txt
+test_path: data/iwslt2012_zh/test.txt
+batch_size: 64
+num_workers: 2
+data_params: 
+    pretrained_token: ernie-3.0-base-zh
+    punc_path: data/iwslt2012_zh/punc_vocab
+    seq_len: 100
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model_type: ErnieLinear
+model:
+    pretrained_token: ernie-3.0-base-zh
+    num_classes: 4
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer_params:
+    weight_decay: 1.0e-6               # weight decay coefficient.
+
+scheduler_params:
+    learning_rate: 1.0e-5               # learning rate.
+    gamma: 0.9999                          # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 20
+num_snapshots: 5
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_snapshots: 10                 # max number of snapshots to keep while training
+seed: 42                          # random seed for paddle, random, and np.random
diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..392ba011c07d15f70e69e27b8bc128ac351b5543
--- /dev/null
+++ b/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml
@@ -0,0 +1,44 @@
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+dataset_type: Ernie
+train_path: data/iwslt2012_zh/train.txt
+dev_path: data/iwslt2012_zh/dev.txt
+test_path: data/iwslt2012_zh/test.txt
+batch_size: 64
+num_workers: 2
+data_params: 
+    pretrained_token: ernie-3.0-medium-zh
+    punc_path: data/iwslt2012_zh/punc_vocab
+    seq_len: 100
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model_type: ErnieLinear
+model:
+    pretrained_token: ernie-3.0-medium-zh
+    num_classes: 4
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer_params:
+    weight_decay: 1.0e-6               # weight decay coefficient.
+
+scheduler_params:
+    learning_rate: 1.0e-5               # learning rate.
+    gamma: 0.9999                          # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 20
+num_snapshots: 5
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_snapshots: 10                 # max number of snapshots to keep while training
+seed: 42                          # random seed for paddle, random, and np.random
diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c57fd94a8f0c18bfe108b4cf5835725ae58a210e
--- /dev/null
+++ b/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml
@@ -0,0 +1,44 @@
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+dataset_type: Ernie
+train_path: data/iwslt2012_zh/train.txt
+dev_path: data/iwslt2012_zh/dev.txt
+test_path: data/iwslt2012_zh/test.txt
+batch_size: 64
+num_workers: 2
+data_params: 
+    pretrained_token: ernie-3.0-mini-zh
+    punc_path: data/iwslt2012_zh/punc_vocab
+    seq_len: 100
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model_type: ErnieLinear
+model:
+    pretrained_token: ernie-3.0-mini-zh
+    num_classes: 4
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer_params:
+    weight_decay: 1.0e-6               # weight decay coefficient.
+
+scheduler_params:
+    learning_rate: 1.0e-5               # learning rate.
+    gamma: 0.9999                          # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 20
+num_snapshots: 5
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_snapshots: 10                 # max number of snapshots to keep while training
+seed: 42                          # random seed for paddle, random, and np.random
diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a7a84c4c1361eef5a85d11880b172964a5e78220
--- /dev/null
+++ b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml
@@ -0,0 +1,44 @@
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+dataset_type: Ernie
+train_path: data/iwslt2012_zh/train.txt
+dev_path: data/iwslt2012_zh/dev.txt
+test_path: data/iwslt2012_zh/test.txt
+batch_size: 64
+num_workers: 2
+data_params: 
+    pretrained_token: ernie-3.0-nano-zh
+    punc_path: data/iwslt2012_zh/punc_vocab
+    seq_len: 100
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model_type: ErnieLinear
+model:
+    pretrained_token: ernie-3.0-nano-zh
+    num_classes: 4
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer_params:
+    weight_decay: 1.0e-6               # weight decay coefficient.
+
+scheduler_params:
+    learning_rate: 1.0e-5               # learning rate.
+    gamma: 0.9999                          # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 20
+num_snapshots: 5
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_snapshots: 10                 # max number of snapshots to keep while training
+seed: 42                          # random seed for paddle, random, and np.random
diff --git a/examples/iwslt2012/punc0/conf/ernie-tiny.yaml b/examples/iwslt2012/punc0/conf/ernie-tiny.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6a5b7fee2599bc8eee13906688e3238f3ce4d704
--- /dev/null
+++ b/examples/iwslt2012/punc0/conf/ernie-tiny.yaml
@@ -0,0 +1,44 @@
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+dataset_type: Ernie
+train_path: data/iwslt2012_zh/train.txt
+dev_path: data/iwslt2012_zh/dev.txt
+test_path: data/iwslt2012_zh/test.txt
+batch_size: 64
+num_workers: 2
+data_params: 
+    pretrained_token: ernie-tiny
+    punc_path: data/iwslt2012_zh/punc_vocab
+    seq_len: 100
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model_type: ErnieLinear
+model:
+    pretrained_token: ernie-tiny
+    num_classes: 4
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer_params:
+    weight_decay: 1.0e-6               # weight decay coefficient.
+
+scheduler_params:
+    learning_rate: 1.0e-5               # learning rate.
+    gamma: 0.9999                          # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 20
+num_snapshots: 5
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_snapshots: 10                 # max number of snapshots to keep while training
+seed: 42                          # random seed for paddle, random, and np.random