ernie reorganize

cc785f8d · chenxuyi · 507e0916 · cc785f8d · cc785f8d · cc785f8d
80 changed file
--- a/.gitignore
+++ b/.gitignore
 *.pyc
 *.un~
 *.swp
+*.egg-info/
--- a/distill/README.md
+++ b/distill/README.md
@@ -50,11 +50,11 @@ sh ./distill/script/distill_chnsenticorp.sh

 该脚本会进行前述的三步：1. 在任务数据上Fine-tune。 2. 加载Fine-tune好的模型对增强数据进行打分。 3.使用Student模型进行训练。脚本采用hard-label蒸馏，在第二步中将会直接预测出ERNIE标注的label。

-该脚本涉及两个python文件:`./distill/finetune_chnsenticorp.py` 负责finetune以及预测teacher模型， `distill/distill_chnsentocorp.py` 负责student模型的训练。事先构造好的增强数据放在`${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug`
+该脚本涉及两个python文件:`./example/finetune_classifier.py` 负责finetune以及预测teacher模型， `distill/distill_chnsentocorp.py` 负责student模型的训练。事先构造好的增强数据放在`${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug`

 在脚本的第二步中，使用 `--do_predict` 参数进入预测模式:
 ```script
-cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |python3 -u ./distill/finetune_chnsenticorp.py \
+cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |python3 -u ./example/finetune_classifier.py \
    --do_predict \
    --data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
    --warm_start_from ${MODEL_PATH}/params \
@@ -86,7 +86,7 @@ sh ./distill/script/distill_chnsenticorp_with_propeller_server.sh

 流程包含3步：1. finetune ERNIE模型。2. 取指标最好的ERNIE模型启动`propeller`服务。 3.在student模型的训练过程中访问服务获取teacher模型的标注。

-此流程涉及两个python文件: `distill/finetune_chnsenticorp.py` 与 `distill/distill_chnsentocorp_with_propeller_server.py`  。其中第一步与离线蒸馏中的用法完全一样。
+此流程涉及两个python文件: `example/finetune_classifier.py` 与 `distill/distill_chnsentocorp_with_propeller_server.py`  。其中第一步与离线蒸馏中的用法完全一样。
 第二步中使用
 ```script
 python3 -m propeller.tools.start_server -p 8113 -m ${teacher_dir}/best/inference/ &

--- a/distill/distill_chnsentocorp.py
+++ b/distill/distill_chnsentocorp.py
@@ -117,7 +117,7 @@ class ClassificationBowModel(propeller.train.Model):
        return {'acc': acc}

 if __name__ == '__main__':
-    parser = propeller.ArgumentParser('DAN model with Paddle')
+    parser = propeller.ArgumentParser('Distill model with Paddle')
    parser.add_argument('--max_seqlen', type=int, default=128)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--unsupervise_data_dir', type=str, required=True)

--- a/distill/distill_chnsentocorp_with_propeller_server.py
+++ b/distill/distill_chnsentocorp_with_propeller_server.py
@@ -118,7 +118,7 @@ class ClassificationBowModel(propeller.train.Model):
        return {'acc': acc}

 if __name__ == '__main__':
-    parser = propeller.ArgumentParser('DAN model with Paddle')
+    parser = propeller.ArgumentParser('distill model with ERNIE')
    parser.add_argument('--max_seqlen', type=int, default=128)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--teacher_vocab_file', type=str, required=True)

--- a/distill/finetune_chnsenticorp.py
+++ b/distill/finetune_chnsenticorp.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-import time
-import logging
-from random import random
-from functools import reduce, partial
-
-import numpy as np
-import multiprocessing
-
-import paddle
-import paddle.fluid as F
-import paddle.fluid.layers as L
-
-from model.ernie import ErnieModel
-from optimization import optimization
-import utils.data
-
-from propeller import log
-import propeller.paddle as propeller
-log.setLevel(logging.DEBUG)
-
-class ClassificationErnieModel(propeller.train.Model):
-    """propeller Model wraper for paddle-ERNIE """
-    def __init__(self, hparam, mode, run_config):
-        self.hparam = hparam
-        self.mode = mode
-        self.run_config = run_config
-
-    def forward(self, features):
-        src_ids, sent_ids = features
-        dtype = 'float16' if self.hparam['fp16'] else 'float32'
-        zero = L.fill_constant([1], dtype='int64', value=0)
-        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype) # assume pad id == 0
-        #input_mask = L.unsqueeze(input_mask, axes=[2])
-        d_shape = L.shape(src_ids)
-        seqlen = d_shape[1]
-        batch_size = d_shape[0]
-        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
-        pos_ids = L.expand(pos_ids, [batch_size, 1])
-        pos_ids = L.unsqueeze(pos_ids, axes=[2])
-        pos_ids = L.cast(pos_ids, 'int64')
-        pos_ids.stop_gradient = True
-        input_mask.stop_gradient = True
-        task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment
-        task_ids.stop_gradient = True
-
-        bert = ErnieModel(
-            src_ids=src_ids,
-            position_ids=pos_ids,
-            sentence_ids=sent_ids,
-            task_ids=task_ids,
-            input_mask=input_mask,
-            config=self.hparam,
-            use_fp16=self.hparam['fp16']
-        )
-
-        cls_feats = bert.get_pooled_output()
-
-        cls_feats = L.dropout(
-            x=cls_feats,
-            dropout_prob=0.1,
-            dropout_implementation="upscale_in_train"
-        )
-
-        logits = L.fc(
-            input=cls_feats,
-            size=self.hparam['num_label'],
-            param_attr=F.ParamAttr(
-                name="cls_out_w",
-                initializer=F.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=F.ParamAttr(
-                name="cls_out_b", initializer=F.initializer.Constant(0.))
-        )
-
-        propeller.summary.histogram('pred', logits)
-
-        if self.mode is propeller.RunMode.PREDICT:
-            probs = L.softmax(logits)
-            return probs
-        else:
-            return logits
-
-    def loss(self, predictions, labels):
-        ce_loss, probs = L.softmax_with_cross_entropy(
-            logits=predictions, label=labels, return_softmax=True)
-        #L.Print(ce_loss, message='per_example_loss')
-        loss = L.mean(x=ce_loss)
-        return loss
-
-    def backward(self, loss):
-        scheduled_lr, _ = optimization(
-            loss=loss,
-            warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
-            num_train_steps=self.run_config.max_steps,
-            learning_rate=self.hparam['learning_rate'],
-            train_program=F.default_main_program(), 
-            startup_prog=F.default_startup_program(),
-            weight_decay=self.hparam['weight_decay'],
-            scheduler="linear_warmup_decay",)
-        propeller.summary.scalar('lr', scheduled_lr)
-
-    def metrics(self, predictions, label):
-        predictions = L.argmax(predictions, axis=1)
-        predictions = L.unsqueeze(predictions, axes=[1])
-        acc = propeller.metrics.Acc(label, predictions)
-        #auc = propeller.metrics.Auc(label, predictions)
-        return {'acc': acc}
-
-
-if __name__ == '__main__':
-    parser = propeller.ArgumentParser('DAN model with Paddle')
-    parser.add_argument('--max_seqlen', type=int, default=128)
-    parser.add_argument('--data_dir', type=str, required=True)
-    parser.add_argument('--vocab_file', type=str, required=True)
-    parser.add_argument('--do_predict', action='store_true')
-    parser.add_argument('--warm_start_from', type=str)
-    args = parser.parse_args()
-    run_config = propeller.parse_runconfig(args)
-    hparams = propeller.parse_hparam(args)
-
-
-    vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))}
-    sep_id = vocab['[SEP]']
-    cls_id = vocab['[CLS]']
-    unk_id = vocab['[UNK]']
-
-    tokenizer = utils.data.CharTokenizer(vocab.keys())
-
-    def tokenizer_func(inputs):
-        '''avoid pickle error'''
-        ret = tokenizer(inputs)
-        return ret
-
-    if not args.do_predict:
-        feature_column = propeller.data.FeatureColumns([
-            propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
-            propeller.data.LabelColumn('label'),
-        ])
-
-        def before(seg_a, label):
-            sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
-            return sentence, segments, label
-
-        def after(sentence, segments, label):
-            sentence, segments, label = utils.data.expand_dims(sentence, segments, label)
-            return sentence, segments, label
-
-        log.debug(os.path.join(args.data_dir, 'train'))
-        train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
-                                       .map(before) \
-                                       .padded_batch(hparams.batch_size, (0, 0, 0)) \
-                                       .map(after) 
-
-        dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
-                                       .map(before) \
-                                       .padded_batch(hparams.batch_size, (0, 0, 0)) \
-                                       .map(after) 
-
-
-        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
-        types = ('int64', 'int64', 'int64')
-
-        train_ds.data_shapes = shapes
-        train_ds.data_types = types
-        dev_ds.data_shapes = shapes
-        dev_ds.data_types = types
-
-        varname_to_warmstart = re.compile('encoder.*|pooled.*|.*embedding|pre_encoder_.*')
-        warm_start_dir = args.warm_start_from
-        ws = propeller.WarmStartSetting(
-                predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
-                from_dir=warm_start_dir
-            )
-
-        best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])
-        propeller.train.train_and_eval(
-                model_class_or_model_fn=ClassificationErnieModel, 
-                params=hparams, 
-                run_config=run_config, 
-                train_dataset=train_ds, 
-                eval_dataset=dev_ds,
-                warm_start_setting=ws, 
-                exporters=[best_exporter])
-        print('dev_acc\t%.5f' % (best_exporter._best['eval']['acc']))
-    else:
-        feature_column = propeller.data.FeatureColumns([
-            propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
-            propeller.data.LabelColumn('label'),
-        ])
-        def before(seg_a):
-            sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
-            return sentence, segments
-        def after(sentence, segments):
-            sentence, segments = utils.data.expand_dims(sentence, segments)
-            return sentence, segments
-        predict_ds = feature_column.build_dataset_from_stdin('predict') \
-                               .map(before) \
-                               .padded_batch(hparams.batch_size, (0, 0)) \
-                               .map(after) 
-        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1])
-        types = ('int64', 'int64')
-
-        predict_ds.data_shapes = shapes
-        predict_ds.data_types = types
-        finetuned_model = propeller.Learner(ClassificationErnieModel, run_config, hparams)
-        for logits, in finetuned_model.predict(predict_ds, ckpt=-1): # ckpt=-1 means last step
-            print(np.argmax(logits))
-
--- a/distill/script/distill_chnsenticorp.sh
+++ b/distill/script/distill_chnsenticorp.sh
 set -x
-export PYTHONPATH=.:$PYTHONPATH
+export PYTHONPATH=.:./ernie/:${PYTHONPATH:-}
 output_dir=./output/distill
 teacher_dir=${output_dir}/teacher
 student_dir=${output_dir}/student

 # 1. finetune teacher
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u ./distill/finetune_chnsenticorp.py \
+python3 -u ./example/finetune_classifier.py \
    --data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
    --warm_start_from ${MODEL_PATH}/params \
    --vocab_file ${MODEL_PATH}/vocab.txt \
@@ -29,7 +29,7 @@ python3 -u ./distill/finetune_chnsenticorp.py \
    --hparam '{ # learn					    
      "warmup_proportion":  0.1,
      "weight_decay": 0.01,
-      "fp16": 0,
+      "use_fp16": 0,
      "learning_rate": 0.00005,
      "num_label": 2,
      "batch_size": 32 
@@ -39,7 +39,7 @@ python3 -u ./distill/finetune_chnsenticorp.py \

 # 2. start a prediction server
 export CUDA_VISIBLE_DEVICES=0
-cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |awk -F"\t" '{print $2}' |python3 -u ./distill/finetune_chnsenticorp.py \
+cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |awk -F"\t" '{print $2}' |python3 -u ./example/finetune_classifier.py \
    --do_predict \
    --data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
    --warm_start_from ${MODEL_PATH}/params \
@@ -58,7 +58,7 @@ cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |awk -
    --hparam '{ # learn
      "warmup_proportion":  0.1,
      "weight_decay": 0.01,
-      "fp16": 0,
+      "use_fp16": 0,
      "learning_rate": 0.00005,
      "num_label": 2,
      "batch_size": 100 
@@ -94,7 +94,6 @@ python3 ./distill/distill_chnsentocorp.py \
    --hparam '{  					     # lr shit
      "warmup_proportion":  0.1,
      "weight_decay": 0.00,
-      "fp16": 0,
      "learning_rate": 1e-4,
      "batch_size": 100 
    }' 

--- a/distill/script/distill_chnsenticorp_with_propeller_server.sh
+++ b/distill/script/distill_chnsenticorp_with_propeller_server.sh
 set -x
-export PYTHONPATH=.:$PYTHONPATH
+export PYTHONPATH=.:./ernie/:${PYTHONPATH:-}
 output_dir=./output/distill
 teacher_dir=${output_dir}/teacher
 student_dir=${output_dir}/student

 # 1. finetune teacher
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u ./distill/finetune_chnsenticorp.py \
+python3 -u ./example/finetune_classifier.py  \
    --data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
    --warm_start_from ${MODEL_PATH}/params \
    --vocab_file ${MODEL_PATH}/vocab.txt \
@@ -29,7 +29,7 @@ python3 -u ./distill/finetune_chnsenticorp.py \
    --hparam '{ # learn
      "warmup_proportion":  0.1,
      "weight_decay": 0.01,
-      "fp16": 0,
+      "use_fp16": 0,
      "learning_rate": 0.00005,
      "num_label": 2,
      "batch_size": 32 
@@ -74,7 +74,6 @@ python3 ./distill/distill_chnsentocorp_with_propeller_server.py \
    --hparam '{ # learn  					    
      "warmup_proportion":  0.1,
      "weight_decay": 0.00,
-      "fp16": 0,
      "learning_rate": 1e-4,
      "batch_size": 100 
    }' 

--- a/_ce.py
+++ b/_ce.py
--- a/batching.py
+++ b/batching.py
--- a/ernie_encoder.py
+++ b/ernie_encoder.py
--- a/__init__.py
+++ b/__init__.py
--- a/finetune/classifier.py
+++ b/finetune/classifier.py
--- a/finetune/mrc.py
+++ b/finetune/mrc.py
--- a/finetune/sequence_label.py
+++ b/finetune/sequence_label.py
--- a/finetune_args.py
+++ b/finetune_args.py
--- a/finetune_launch.py
+++ b/finetune_launch.py
--- a/infer_classifyer.py
+++ b/infer_classifyer.py
--- a/finetune/__init__.py
+++ b/finetune/__init__.py
--- a/model/ernie.py
+++ b/model/ernie.py
--- a/model/ernie_v1.py
+++ b/model/ernie_v1.py
--- a/model/transformer_encoder.py
+++ b/model/transformer_encoder.py
--- a/optimization.py
+++ b/optimization.py
--- a/pretrain_args.py
+++ b/pretrain_args.py
--- a/pretrain_launch.py
+++ b/pretrain_launch.py
--- a/model/__init__.py
+++ b/model/__init__.py
--- a/reader/pretraining.py
+++ b/reader/pretraining.py
--- a/reader/task_reader.py
+++ b/reader/task_reader.py
--- a/run_classifier.py
+++ b/run_classifier.py
--- a/run_mrc.py
+++ b/run_mrc.py
--- a/run_sequence_labeling.py
+++ b/run_sequence_labeling.py
--- a/ernie/service/client.py
+++ b/ernie/service/client.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+import os
+import argparse
+from propeller.service.client import InferenceClient
+from propeller import log
+import six
+import utils.data
+from time import time
+import numpy as np
+
+class ErnieClient(InferenceClient):
+    """Text front-end for a propeller ``InferenceClient``.
+
+    Turns raw text (single sentences or sentence pairs) into padded token-id
+    and segment-id batches using a vocab file, and forwards them to the
+    inference server.
+    """
+    def __init__(self,
+            vocab_file,
+            host='localhost',
+            port=8888,
+            batch_size=32,
+            num_coroutine=1,
+            timeout=10.,
+            max_seqlen=128):
+        """Connect to ``tcp://host:port`` and load the vocabulary.
+
+        Args:
+            vocab_file: path to the vocab file; the first tab-separated field
+                of each line is the token and the line index is its id.
+                Must contain ``[CLS]`` and ``[SEP]`` (KeyError otherwise).
+            host, port: address of the inference server.
+            batch_size, num_coroutine, timeout: forwarded unchanged to
+                ``InferenceClient.__init__``.
+            max_seqlen: maximum sequence length passed to the pair builders.
+        """
+        host_port = 'tcp://%s:%d' % (host, port)
+        super(ErnieClient, self).__init__(host_port, batch_size=batch_size, num_coroutine=num_coroutine, timeout=timeout)
+        # read in binary mode and decode explicitly so the result does not depend on the locale
+        with open(vocab_file, 'rb') as vocab_f:
+            self.vocab = {line.strip().split(b'\t')[0].decode('utf8'): idx for idx, line in enumerate(vocab_f)}
+        self.tokenizer = utils.data.CharTokenizer(self.vocab.keys())
+        self.max_seqlen = max_seqlen
+        self.cls_id = self.vocab['[CLS]']
+        self.sep_id = self.vocab['[SEP]']
+
+    def txt_2_id(self, text):
+        """Tokenize ``text`` and return a numpy array of vocab ids.
+
+        Raises KeyError if the tokenizer yields a token missing from the vocab.
+        """
+        ids = np.array([self.vocab[i] for i in self.tokenizer(text)])
+        return ids
+
+    def pad_and_batch(self, ids):
+        """Zero-pad 1-D id arrays to the longest one and stack them.
+
+        Returns an array of shape ``(len(ids), max_len, 1)``.
+        """
+        max_len = max(map(len, ids))
+        padded = np.stack([np.pad(i, [[0, max_len - len(i)]], mode='constant') for i in ids])
+        padded = np.expand_dims(padded, axis=-1)
+        return padded
+
+    def __call__(self, text_a, text_b=None):
+        """Encode a batch of texts (or text pairs) and query the server.
+
+        Args:
+            text_a: sequence of texts; text-type items are UTF-8 encoded to
+                bytes before tokenization, other items are passed through.
+            text_b: optional sequence of second segments, same length as
+                ``text_a``.
+
+        Returns:
+            The single output produced by ``InferenceClient.__call__``
+            (the unpacking below requires exactly one output).
+
+        Raises:
+            ValueError: if ``text_a`` and ``text_b`` differ in length.
+        """
+        if text_b is not None and len(text_a) != len(text_b):
+            # format the lengths: applying %d to the sequences themselves raises TypeError
+            raise ValueError('text_b %d has different size than text_a %d' % (len(text_b), len(text_a)))
+        text_a = [i.encode('utf8') if isinstance(i, six.string_types) else i for i in text_a]
+        if text_b is not None:
+            text_b = [i.encode('utf8') if isinstance(i, six.string_types) else i for i in text_b]
+
+        ids_a = map(self.txt_2_id, text_a)
+        if text_b is not None:
+            ids_b = map(self.txt_2_id, text_b)
+            ret = [utils.data.build_2_pair(a, b, self.max_seqlen, self.cls_id, self.sep_id) for a, b in zip(ids_a, ids_b)]
+        else:
+            ret = [utils.data.build_1_pair(a, self.max_seqlen, self.cls_id, self.sep_id) for a in ids_a]
+        # each builder returns a (token ids, segment ids) pair; regroup them per field
+        sen_ids, token_type_ids = zip(*ret)
+        sen_ids = self.pad_and_batch(sen_ids)
+        token_type_ids = self.pad_and_batch(token_type_ids)
+        ret, = super(ErnieClient, self).__call__(sen_ids, token_type_ids)
+        return ret
+
+
+if __name__ == '__main__':
+    # Command-line driver: read a tab-separated text file (1 or 2 columns),
+    # encode it through the server in chunks, and save the stacked result as .npy.
+    parser = argparse.ArgumentParser(description='ernie_encoder_client')
+    parser.add_argument('--host', type=str, default='localhost')
+    parser.add_argument('-i', '--input', type=str, required=True)
+    parser.add_argument('-o', '--output', type=str, required=True)
+    parser.add_argument('-p', '--port', type=int, default=8888)
+    parser.add_argument('--batch_size', type=int, default=32)
+    parser.add_argument('--num_coroutine', type=int, default=1)
+    parser.add_argument('--vocab', type=str, required=True)
+    args = parser.parse_args()
+
+    client = ErnieClient(args.vocab, args.host, args.port, batch_size=args.batch_size, num_coroutine=args.num_coroutine)
+    # close the input file deterministically instead of leaking the handle
+    with open(args.input, 'rb') as inf:
+        inputs = [line.strip().split(b'\t') for line in inf]
+    if len(inputs) == 0:
+        raise ValueError('empty input')
+    # number of rows handed to the client per call
+    send_batch = args.num_coroutine * args.batch_size
+    rets = []
+    start = time()
+    for offset in range(0, len(inputs), send_batch):
+        chunk = inputs[offset: offset + send_batch]
+        # transpose rows into columns; NOTE(review): zip stops at the shortest row,
+        # so rows with differing column counts are silently truncated
+        columns = list(zip(*chunk))
+        if len(columns) > 2:
+            raise ValueError('inputs file has more than 2 columns')
+        ret = client(*columns)
+        if len(ret.shape) == 3:
+            ret = ret[:, 0, :] # take cls
+        rets.append(ret)
+    end = time()
+    with open(args.output, 'wb') as outf:
+        arr = np.concatenate(rets, 0)
+        np.save(outf, arr)
+        log.info('query num: %d average latency %.5f' % (len(inputs), (end - start)/len(inputs)))
+
+
--- a/ernie/service/encoder_server.py
+++ b/ernie/service/encoder_server.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+import os
+import argparse
+import logging
+import logging.handlers
+import re
+from propeller.service.server import InferenceServer
+from propeller import log
+
+if __name__ == "__main__":
+    # Command-line driver: serve one exported encoder sub-model
+    # (<model_dir>/pooler or <model_dir>/enc<N>) over a propeller InferenceServer.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m', '--model_dir', type=str, required=True)
+    parser.add_argument('-p', '--port', type=int, default=8888)
+    parser.add_argument('-v', '--verbose', action='store_true')
+    parser.add_argument('--encode_layer', type=str, choices=[
+        'pooler',
+        'layer12',
+        'layer11',
+        'layer10',
+        'layer9',
+        'layer8',
+        'layer7',
+        'layer6',
+        'layer5',
+        'layer4',
+        'layer3',
+        'layer2',
+        'layer1',
+        ], default='pooler')
+    args = parser.parse_args()
+
+    if args.verbose:
+        log.setLevel(logging.DEBUG)
+    cuda_env = os.getenv("CUDA_VISIBLE_DEVICES")
+    if cuda_env is None:
+        raise RuntimeError('CUDA_VISIBLE_DEVICES not set')
+    # one device per comma-separated entry of CUDA_VISIBLE_DEVICES
+    n_devices = len(cuda_env.split(","))
+    if args.encode_layer.lower() == 'pooler':
+        model_dir = os.path.join(args.model_dir, 'pooler')
+    else:
+        # argparse `choices` guarantees the value looks like 'layerN', so the match cannot be None
+        pat = re.compile(r'layer(\d+)')
+        match = pat.match(args.encode_layer.lower())
+        layer = int(match.group(1))
+        model_dir = os.path.join(args.model_dir, 'enc%d' % layer)
+
+    server = InferenceServer(model_dir, n_devices)
+    log.info('propeller server listening on port %d' % args.port)
+    server.listen(args.port)
--- a/tokenization.py
+++ b/tokenization.py
--- a/train.py
+++ b/train.py
--- a/reader/__init__.py
+++ b/reader/__init__.py
--- a/utils/args.py
+++ b/utils/args.py
--- a/utils/cards.py
+++ b/utils/cards.py
--- a/utils/cmrc2018_eval.py
+++ b/utils/cmrc2018_eval.py
--- a/utils/data.py
+++ b/utils/data.py
@@ -108,7 +108,7 @@ class CharTokenizer(object):
        """
        self.vocab = set(vocab)
        #self.pat = re.compile(r'([,.!?\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]|[\u4e00-\u9fa5]|[a-zA-Z0-9]+)')
-        self.pat = re.compile(r'\S')
+        self.pat =  re.compile(r'([a-zA-Z0-9]+|\S)')
        self.lower = lower

    def __call__(self, sen):
@@ -132,7 +132,7 @@ def build_2_pair(seg_a, seg_b, max_seqlen, cls_id, sep_id):

    seqlen = sen_emb.shape[0]
    #random truncate
-    random_begin = 0#np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)
+    random_begin = 0 #np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)
    sen_emb = sen_emb[random_begin: random_begin + max_seqlen]
    token_type_emb = token_type_emb[random_begin: random_begin + max_seqlen]

@@ -147,7 +147,7 @@ def build_1_pair(seg_a, max_seqlen, cls_id, sep_id):

    seqlen = sen_emb.shape[0]
    #random truncate
-    random_begin = 0#np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)
+    random_begin = 0 #np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)

    sen_emb = sen_emb[random_begin: random_begin + max_seqlen]
    token_type_emb = token_type_emb[random_begin: random_begin + max_seqlen]

--- a/utils/fp16.py
+++ b/utils/fp16.py
--- a/utils/init.py
+++ b/utils/init.py
--- a/propeller_xnli_demo.ipynb
+++ b/propeller_xnli_demo.ipynb
--- a/script/en_glue/ernie_base/CoLA/task.sh
+++ b/script/en_glue/ernie_base/CoLA/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_sync_nccl_allreduce=1
 export FLAGS_eager_delete_tensor_gb=0.0
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -20,7 +21,7 @@ batch_size=64
 epoch=3

 for i in {1..5};do
-python -u run_classifier.py                                                          \
+python -u ernie/run_classifier.py                                                          \
       --use_cuda true                                                               \
       --for_cn  False                                                               \
       --use_fast_executor ${e_executor:-"true"}                                     \

--- a/script/en_glue/ernie_base/MNLI/task.sh
+++ b/script/en_glue/ernie_base/MNLI/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -20,7 +21,7 @@ for i in {1..5};do

 timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-python -u run_classifier.py                                                             \
+python -u ./ernie/run_classifier.py                                                             \
       --use_cuda true                                                                  \
       --use_fast_executor ${e_executor:-"true"}                                        \
       --tokenizer ${TOKENIZER:-"FullTokenizer"}                                        \

--- a/script/en_glue/ernie_base/MRPC/task.sh
+++ b/script/en_glue/ernie_base/MRPC/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -19,7 +20,7 @@ epoch=4
 for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-    python -u run_classifier.py                                              \
+    python -u ./ernie/run_classifier.py                                              \
           --use_cuda true                                                   \
           --for_cn  False                                                   \
           --use_fast_executor ${e_executor:-"true"}                         \

--- a/script/en_glue/ernie_base/QNLI/task.sh
+++ b/script/en_glue/ernie_base/QNLI/task.sh
@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`

 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -22,7 +23,7 @@ for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-    python -u run_classifier.py                                                \
+    python -u ./ernie/run_classifier.py                                                \
           --use_cuda true                                                     \
           --for_cn False                                                      \
           --use_fast_executor ${e_executor:-"true"}                           \

--- a/script/en_glue/ernie_base/QQP/task.sh
+++ b/script/en_glue/ernie_base/QQP/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -21,7 +22,7 @@ for i in {1..1};do

  timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-  python -u run_classifier.py                                                      \
+  python -u ./ernie/run_classifier.py                                                      \
       --for_cn False                                                              \
       --ernie_config_path script/en_glue/ernie_base/ernie_config.json             \
       --validation_steps 1000000000000                                            \

--- a/script/en_glue/ernie_base/RTE/task.sh
+++ b/script/en_glue/ernie_base/RTE/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -16,7 +17,7 @@ for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-    python -u run_classifier.py                                                \
+    python -u ./ernie/run_classifier.py                                                \
               --use_cuda true                                                 \
               --for_cn False                                                  \
               --use_fast_executor ${e_executor:-"true"}                       \

--- a/script/en_glue/ernie_base/SST-2/task.sh
+++ b/script/en_glue/ernie_base/SST-2/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -18,7 +19,7 @@ epoch=4

 for i in {1..5};do

- python -u run_classifier.py                                                       \
+ python -u ./ernie/run_classifier.py                                                       \
      --for_cn  False                                                              \
      --use_cuda true                                                              \
      --use_fast_executor ${e_executor:-"true"}                                    \

--- a/script/en_glue/ernie_base/STS-B/task.sh
+++ b/script/en_glue/ernie_base/STS-B/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -18,7 +19,7 @@ epoch=3

 for i in {1..5};do

-python -u run_classifier.py                                                         \
+python -u ./ernie/run_classifier.py                                                         \
       --use_cuda true                                                              \
       --for_cn  False                                                              \
       --use_fast_executor ${e_executor:-"true"}                                    \

--- a/script/en_glue/ernie_base/WNLI/task.sh
+++ b/script/en_glue/ernie_base/WNLI/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -20,7 +21,7 @@ epoch=4

 for i in {1..5};do

-   python -u run_classifier.py                                            \
+   python -u ./ernie/run_classifier.py                                            \
       --for_cn False                                                     \
       --use_cuda true                                                    \
       --use_fast_executor ${e_executor:-"true"}                          \

--- a/script/en_glue/ernie_large/CoLA/task.sh
+++ b/script/en_glue/ernie_large/CoLA/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_sync_nccl_allreduce=1
 export FLAGS_eager_delete_tensor_gb=0.0
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -19,7 +20,7 @@ epoch=5
 for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-    python -u run_classifier.py                                              \
+    python -u ./ernie/run_classifier.py                                              \
           --use_cuda true                                                   \
           --for_cn  False                                                   \
           --use_fast_executor ${e_executor:-"true"}                         \

--- a/script/en_glue/ernie_large/MNLI/task.sh
+++ b/script/en_glue/ernie_large/MNLI/task.sh
@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`

 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -17,7 +18,7 @@ for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-    python -u run_classifier.py                                                             \
+    python -u ./ernie/run_classifier.py                                                             \
           --use_cuda true                                                                  \
           --use_fast_executor ${e_executor:-"true"}                                        \
           --tokenizer ${TOKENIZER:-"FullTokenizer"}                                        \

--- a/script/en_glue/ernie_large/MRPC/task.sh
+++ b/script/en_glue/ernie_large/MRPC/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -20,7 +21,7 @@ epoch=4
 for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-    python -u run_classifier.py                                              \
+    python -u ./ernie/run_classifier.py                                              \
           --use_cuda true                                                   \
           --for_cn  False                                                   \
           --use_fast_executor ${e_executor:-"true"}                         \

--- a/script/en_glue/ernie_large/QNLI/task.sh
+++ b/script/en_glue/ernie_large/QNLI/task.sh
@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`

 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -21,7 +22,7 @@ for i in {1..5};do

    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-    python -u run_classifier.py                                            \
+    python -u ./ernie/run_classifier.py                                            \
           --use_cuda true                                                 \
           --for_cn False                                                  \
           --use_fast_executor ${e_executor:-"true"}                       \

--- a/script/en_glue/ernie_large/QQP/task.sh
+++ b/script/en_glue/ernie_large/QQP/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -16,7 +17,7 @@ for i in {1..5};do

  timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-  python -u run_classifier.py                                                      \
+  python -u ./ernie/run_classifier.py                                                      \
       --for_cn False                                                              \
       --ernie_config_path script/en_glue/ernie_large/ernie_config.json            \
       --validation_steps 1000000000000                                            \

--- a/script/en_glue/ernie_large/RTE/task.sh
+++ b/script/en_glue/ernie_large/RTE/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -16,7 +17,7 @@ mkdir -p log/
 for i in {1..5};do
    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`

-    python -u run_classifier.py                                             \
+    python -u ./ernie/run_classifier.py                                             \
               --use_cuda true                                              \
               --for_cn False                                               \
               --use_fast_executor ${e_executor:-"true"}                    \

--- a/script/en_glue/ernie_large/SST-2/task.sh
+++ b/script/en_glue/ernie_large/SST-2/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 mkdir -p log/

@@ -19,7 +20,7 @@ epoch=4

 for i in {1..5};do

- python -u run_classifier.py                                          \
+ python -u ./ernie/run_classifier.py                                          \
      --for_cn  False                                                 \
      --use_cuda true                                                 \
      --use_fast_executor ${e_executor:-"true"}                       \

--- a/script/en_glue/ernie_large/STS-B/task.sh
+++ b/script/en_glue/ernie_large/STS-B/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -15,7 +16,7 @@ mkdir -p log/

 for i in {1..5};do

-python -u run_classifier.py                                             \
+python -u ./ernie/run_classifier.py                                             \
       --use_cuda true                                                  \
       --for_cn  False                                                  \
       --use_fast_executor ${e_executor:-"true"}                        \

--- a/script/en_glue/ernie_large/WNLI/task.sh
+++ b/script/en_glue/ernie_large/WNLI/task.sh
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}

 if [[ -f ./model_conf ]];then
    source ./model_conf
@@ -19,7 +20,7 @@ epoch=4

 for i in {1..5};do

-python -u run_classifier.py                                                \
+python -u ./ernie/run_classifier.py                                                \
       --for_cn False                                                      \
       --use_cuda true                                                     \
       --use_fast_executor ${e_executor:-"true"}                           \

--- a/script/zh_task/ernie_base/run_ChnSentiCorp.sh
+++ b/script/zh_task/ernie_base/run_ChnSentiCorp.sh
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_base/run_bq.sh
+++ b/script/zh_task/ernie_base/run_bq.sh
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u ./run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                     --use_cuda true \
                     --verbose true \
                     --do_train true \

--- a/script/zh_task/ernie_base/run_cmrc2018.sh
+++ b/script/zh_task/ernie_base/run_cmrc2018.sh
@@ -4,12 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_mrc.py --use_cuda true\
+./ernie/run_mrc.py --use_cuda true\
                    --batch_size 16 \
                    --in_tokens false\
                    --use_fast_executor true \

--- a/script/zh_task/ernie_base/run_dbqa.sh
+++ b/script/zh_task/ernie_base/run_dbqa.sh
@@ -4,13 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-
-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_classifier.py \
+./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_base/run_drcd.sh
+++ b/script/zh_task/ernie_base/run_drcd.sh
@@ -4,12 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_mrc.py --use_cuda true\
+./ernie/run_mrc.py --use_cuda true\
                    --batch_size 16 \
                    --in_tokens false\
                    --use_fast_executor true \

--- a/script/zh_task/ernie_base/run_lcqmc.sh
+++ b/script/zh_task/ernie_base/run_lcqmc.sh
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_base/run_msra_ner.sh
+++ b/script/zh_task/ernie_base/run_msra_ner.sh
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_sequence_labeling.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_sequence_labeling.py \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \

--- a/script/zh_task/ernie_base/run_thuc.sh
+++ b/script/zh_task/ernie_base/run_thuc.sh
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \

--- a/script/zh_task/ernie_base/run_xnli.sh
+++ b/script/zh_task/ernie_base/run_xnli.sh
@@ -4,12 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_classifier.py \
+./ernie/run_classifier.py \
   --use_cuda true \
   --do_train true \
   --do_val true \

--- a/script/zh_task/ernie_large/run_ChnSentiCorp.sh
+++ b/script/zh_task/ernie_large/run_ChnSentiCorp.sh
@@ -3,7 +3,9 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_large/run_bq.sh
+++ b/script/zh_task/ernie_large/run_bq.sh
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u ./run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                     --use_cuda true \
                     --verbose true \
                     --do_train true \

--- a/script/zh_task/ernie_large/run_cmrc2018.sh
+++ b/script/zh_task/ernie_large/run_cmrc2018.sh
@@ -4,12 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_mrc.py --use_cuda true\
+./ernie/run_mrc.py --use_cuda true\
                    --batch_size 8 \
                    --in_tokens false\
                    --use_fast_executor true \

--- a/script/zh_task/ernie_large/run_dbqa.sh
+++ b/script/zh_task/ernie_large/run_dbqa.sh
@@ -3,12 +3,13 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_classifier.py \
+./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_large/run_drcd.sh
+++ b/script/zh_task/ernie_large/run_drcd.sh
@@ -4,12 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_mrc.py --use_cuda true\
+./ernie/run_mrc.py --use_cuda true\
                    --batch_size 8 \
                    --in_tokens false\
                    --use_fast_executor true \

--- a/script/zh_task/ernie_large/run_lcqmc.sh
+++ b/script/zh_task/ernie_large/run_lcqmc.sh
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --verbose true \
                   --do_train true \

--- a/script/zh_task/ernie_large/run_msra_ner.sh
+++ b/script/zh_task/ernie_large/run_msra_ner.sh
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0

-python -u run_sequence_labeling.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_sequence_labeling.py \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \

--- a/script/zh_task/ernie_large/run_thuc.sh
+++ b/script/zh_task/ernie_large/run_thuc.sh
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \

--- a/script/zh_task/ernie_large/run_xnli.sh
+++ b/script/zh_task/ernie_large/run_xnli.sh
@@ -3,13 +3,13 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

-
-python ./finetune_launch.py  \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-run_classifier.py \
+./ernie/run_classifier.py \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \

--- a/script/zh_task/pretrain.sh
+++ b/script/zh_task/pretrain.sh
@@ -3,12 +3,12 @@ set -eux
 export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python ./pretrain_launch.py  \
+python ./ernie/pretrain_launch.py  \
    --nproc_per_node 8 \
    --selected_gpus 0,1,2,3,4,5,6,7 \
    --node_ips $(hostname -i) \
    --node_id 0 \
-./train.py --use_cuda True \
+./ernie/train.py --use_cuda True \
                --is_distributed False\
                --use_fast_executor True \
                --weight_sharing True \

--- a/utils/__init__.py
+++ b/utils/__init__.py