From a40cc12fae4cd1b6d5f07cd49aa1a092e76dbd0b Mon Sep 17 00:00:00 2001 From: wanghua Date: Sat, 22 Aug 2020 15:43:09 +0800 Subject: [PATCH] modify BERT and TinyBERT README.md --- model_zoo/official/README.md | 1 + model_zoo/official/nlp/bert/README.md | 307 +++++++-- model_zoo/official/nlp/bert/run_classifier.py | 2 +- .../run_distribute_pretrain.py | 2 +- .../ms2tf_config.py | 636 ------------------ .../ms_and_tf_checkpoint_transfer_tools.py | 144 ---- .../readme.md | 22 - .../nlp/bert/scripts/run_classifier.sh | 2 +- ....sh => run_distributed_pretrain_ascend.sh} | 4 +- ...sh => run_distributed_pretrain_for_gpu.sh} | 4 +- .../official/nlp/bert/scripts/run_ner.sh | 2 +- .../official/nlp/bert/scripts/run_squad.sh | 2 +- ...n.sh => run_standalone_pretrain_ascend.sh} | 2 +- model_zoo/official/nlp/tinybert/README.md | 320 +++++++-- ...ute_gd.sh => run_distributed_gd_ascend.sh} | 4 +- ...r_gpu.sh => run_distributed_gd_for_gpu.sh} | 4 +- ...lone_gd.sh => run_standalone_gd_ascend.sh} | 0 ...lone_td.sh => run_standalone_td_ascend.sh} | 0 18 files changed, 523 insertions(+), 935 deletions(-) delete mode 100644 model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms2tf_config.py delete mode 100644 model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms_and_tf_checkpoint_transfer_tools.py delete mode 100644 model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/readme.md rename model_zoo/official/nlp/bert/scripts/{run_distribute_pretrain.sh => run_distributed_pretrain_ascend.sh} (90%) rename model_zoo/official/nlp/bert/scripts/{run_distribute_pretrain_for_gpu.sh => run_distributed_pretrain_for_gpu.sh} (89%) rename model_zoo/official/nlp/bert/scripts/{run_standalone_pretrain.sh => run_standalone_pretrain_ascend.sh} (96%) rename model_zoo/official/nlp/tinybert/scripts/{run_distribute_gd.sh => run_distributed_gd_ascend.sh} (93%) rename model_zoo/official/nlp/tinybert/scripts/{run_distribute_gd_for_gpu.sh => run_distributed_gd_for_gpu.sh} (86%) rename model_zoo/official/nlp/tinybert/scripts/{run_standalone_gd.sh => run_standalone_gd_ascend.sh} (100%) rename model_zoo/official/nlp/tinybert/scripts/{run_standalone_td.sh => run_standalone_td_ascend.sh} (100%) diff --git a/model_zoo/official/README.md b/model_zoo/official/README.md index e921cfe3d..8aaf07071 100644 --- a/model_zoo/official/README.md +++ b/model_zoo/official/README.md @@ -40,6 +40,7 @@ In order to facilitate developers to enjoy the benefits of MindSpore framework, - [Natural Language Processing](#natural-language-processing) - [BERT[benchmark]](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/bert/README.md) + - [TinyBERT](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/tinybert/README.md) - [MASS](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/mass/README.md) - [Transformer](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/transformer/README.md) - [Recommendation](#recommendation) diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md index 160a87219..8325bb7c7 100644 --- a/model_zoo/official/nlp/bert/README.md +++ b/model_zoo/official/nlp/bert/README.md @@ -1,80 +1,135 @@ -# BERT Example -## Description -This example implements pre-training, fine-tuning and evaluation of [BERT-base](https://github.com/google-research/bert) and [BERT-NEZHA](https://github.com/huawei-noah/Pretrained-Language-Model). 
-
-## Requirements
-- Install [MindSpore](https://www.mindspore.cn/install/en).
-- Download the zhwiki dataset for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format and move the files to a specified path.
-- Download dataset for fine-tuning and evaluation such as CLUENER, TNEWS, SQuAD v1.1, etc.
-- Convert dataset files from json format to tfrecord format, please refer to run_classifier.py which in [BERT](https://github.com/google-research/bert) repository.
-> Notes:
-  If you are running a fine-tuning or evaluation task, prepare a checkpoint from pre-train.
-
-## Running the Example
-### Pre-Training
-- Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
-
-- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
+# Contents
+- [BERT Description](#bert-description)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+    - [Dataset Preparation](#dataset-preparation)
+    - [Training Process](#training-process)
+    - [Evaluation Process](#evaluation-process)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Evaluation Performance](#evaluation-performance)
+- [Description of Random Situation](#description-of-random-situation)
+- [ModelZoo Homepage](#modelzoo-homepage)

- ``` bash
-    bash scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
-    ```
+# [BERT Description](#contents)
+The BERT network was proposed by Google in 2018 and has made a breakthrough in the field of NLP. The network follows a pre-training plus fine-tuning paradigm: without modifying the large backbone structure, it can handle many kinds of text tasks by adding only an output layer. The backbone code of BERT adopts the Encoder structure of the Transformer. The attention mechanism is introduced to enable the output layer to capture high-dimensional global semantic information. Pre-training uses denoising auto-encoding tasks, namely MLM (Masked Language Model) and NSP (Next Sentence Prediction), which require no labeled data, so pre-training can be performed on massive text corpora; downstream tasks then need only a small amount of fine-tuning to obtain good results. The pre-training plus fine-tuning mode created by BERT has been widely adopted by subsequent NLP networks.

-- Run `run_standalone_pretrain_for_gpu.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+[Paper](https://arxiv.org/abs/1810.04805): Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). arXiv preprint arXiv:1810.04805.

- ``` bash
-    bash scripts/run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
-    ```
+[Paper](https://arxiv.org/abs/1909.00204): Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen, Qun Liu. [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204). arXiv preprint arXiv:1909.00204.

-- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
+# [Model Architecture](#contents)
+The backbone structure of BERT is the Transformer. For BERT_base, the Transformer contains 12 encoder modules; for BERT_NEZHA, it contains 24. In both, each encoder module contains one self-attention module, and each self-attention module contains one attention module. The difference between BERT_base and BERT_NEZHA is that BERT_base uses absolute position encoding to produce the position embedding vectors, while BERT_NEZHA uses relative position encoding.

- ``` bash
-    bash scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
-    ```
+# [Dataset](#contents)
+- Download the zhwiki or enwiki dataset for pre-training. Extract and refine texts in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format; please refer to the create_pretraining_data.py file in the [BERT](https://github.com/google-research/bert) repository.
+- Download datasets for fine-tuning and evaluation such as CLUENER, TNEWS, SQuAD v1.1, etc. To convert dataset files from JSON format to TFRecord format, please refer to run_classifier.py in the [BERT](https://github.com/google-research/bert) repository.
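+For the pre-training data, for instance, the TFRecord files can be produced with the create_pretraining_data.py tool from the BERT repository. The file paths, vocabulary file and numeric values below are illustrative assumptions only, not required settings:
+
+```bash
+# Hedged sketch: generate TFRecord pre-training data with the google-research/bert tooling.
+# All paths and hyperparameter values here are placeholders; adjust them to your corpus.
+python create_pretraining_data.py \
+    --input_file=./zhwiki_cleaned/wiki_00.txt \
+    --output_file=./data/cn-wiki-128.tfrecord \
+    --vocab_file=./vocab.txt \
+    --do_lower_case=True \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --masked_lm_prob=0.15 \
+    --dupe_factor=5
+```
+
+Here max_seq_length=128 matches the sequence length implied by the cn-wiki-128 dataset name used throughout this document.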
-- Run `run_distribute_pretrain_for_gpu.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+# [Environment Requirements](#contents)
+- Hardware(Ascend)
+  - Prepare hardware environment with Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get access to the resources.
+- Framework
+  - [MindSpore](https://gitee.com/mindspore/mindspore)
+- For more information, please check the resources below:
+  - [MindSpore tutorials](https://www.mindspore.cn/tutorial/en/master/index.html)
+  - [MindSpore API](https://www.mindspore.cn/api/en/master/index.html)

- ```bash
-    bash scripts/run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR
-    ```
+# [Quick Start](#contents)
+After installing MindSpore via the official website, you can start pre-training, fine-tuning and evaluation as follows:
+```bash
+# run standalone pre-training example
+bash scripts/run_standalone_pretrain_ascend.sh 0 1 /path/cn-wiki-128
+
-### Fine-Tuning and Evaluation
-- Including three kinds of task: Classification, NER(Named Entity Recognition) and SQuAD(Stanford Question Answering Dataset)
+# run distributed pre-training example
+bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json
+
+# run fine-tuning and evaluation example
+- If you are going to run a fine-tuning task, please prepare a checkpoint from pre-training.
 - Set bert network config and optimizer hyperparameters in `finetune_eval_config.py`.
-
 - Classification task: Set task related hyperparameters in scripts/run_classifier.sh.
-  Run `bash scripts/run_classifier.py` for fine-tuning of BERT-base and BERT-NEZHA model.
-   ```bash
-   bash scripts/run_classifier.sh
-   ```
+  bash scripts/run_classifier.sh
+
 - NER task: Set task related hyperparameters in scripts/run_ner.sh.
-  Run `bash scripts/run_ner.py` for fine-tuning of BERT-base and BERT-NEZHA model.
-   ```bash
-   bash scripts/run_ner.sh
-   ```
+  bash scripts/run_ner.sh
+
+- SQuAD task: Set task related hyperparameters in scripts/run_squad.sh.
-  Run `bash scripts/run_squad.py` for fine-tuning of BERT-base and BERT-NEZHA model.
-   ```bash
-   bash scripts/run_squad.sh
-   ```
+  bash scripts/run_squad.sh
+```
+
+For distributed training, an hccl configuration file in JSON format needs to be created in advance.
+Please follow the instructions in the link below:
+https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.
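+As a concrete illustration, the hccl_tools.py helper from that page can generate the rank table file; the invocation below, including the repository path and the device range, is an assumed example rather than a fixed command:
+
+```bash
+# Hedged sketch: produce hccl.json for devices 0-7 of one Ascend server with the
+# model_zoo/utils/hccl_tools helper; the checkout path may differ on your machine.
+python mindspore/model_zoo/utils/hccl_tools/hccl_tools.py --device_num "[0,8)"
+```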
+# [Script Description](#contents)
+
+## [Script and Sample Code](#contents)
-## Usage
+```shell
+.
+└─bert
+  ├─README.md
+  ├─scripts
+    ├─ascend_distributed_launcher
+      ├─__init__.py
+      ├─hyper_parameter_config.ini            # hyper parameter for distributed pretraining
+      ├─run_distribute_pretrain.py            # script for distributed pretraining
+      ├─README.md
+    ├─run_classifier.sh                       # shell script for standalone classifier task
+    ├─run_ner.sh                              # shell script for standalone NER task
+    ├─run_squad.sh                            # shell script for standalone SQUAD task
+    ├─run_standalone_pretrain_ascend.sh       # shell script for standalone pretrain on ascend
+    ├─run_distributed_pretrain_ascend.sh      # shell script for distributed pretrain on ascend
+    └─run_standalone_pretrain_for_gpu.sh      # shell script for standalone pretrain on gpu
+  ├─src
+    ├─__init__.py
+    ├─assessment_method.py                    # assessment method for evaluation
+    ├─bert_for_finetune.py                    # backbone code of network
+    ├─bert_for_pre_training.py                # backbone code of network
+    ├─bert_model.py                           # backbone code of network
+    ├─clue_classification_dataset_precess.py  # data preprocessing
+    ├─cluner_evaluation.py                    # evaluation for cluner
+    ├─config.py                               # parameter configuration for pretraining
+    ├─CRF.py                                  # assessment method for clue dataset
+    ├─dataset.py                              # data preprocessing
+    ├─finetune_eval_config.py                 # parameter configuration for finetuning
+    ├─finetune_eval_model.py                  # backbone code of network
+    ├─fused_layer_norm.py                     # layer norm optimized for Ascend
+    ├─sample_process.py                       # sample processing
+    └─utils.py                                # util function
+  ├─pretrain_eval.py                          # train and eval net
+  ├─run_classifier.py                         # finetune and eval net for classifier task
+  ├─run_ner.py                                # finetune and eval net for ner task
+  ├─run_pretrain.py                           # train net for pretraining phase
+  └─run_squad.py                              # finetune and eval net for squad task
+```
+
+## [Script Parameters](#contents)
### Pre-Training
```
usage: run_pretrain.py  [--distribute DISTRIBUTE] [--epoch_size N] [--device_num N] [--device_id N]
-                       [--enable_save_ckpt ENABLE_SAVE_CKPT]
+                       [--enable_save_ckpt ENABLE_SAVE_CKPT] [--device_target DEVICE_TARGET]
                        [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE]
-                       [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N] [--checkpoint_path CHECKPOINT_PATH]
+                       [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N]
+                       [--save_checkpoint_path SAVE_CHECKPOINT_PATH]
+                       [--load_checkpoint_path LOAD_CHECKPOINT_PATH]
                        [--save_checkpoint_steps N] [--save_checkpoint_num N]
-                       [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR]
+                       [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR] [--train_steps N]

options:
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
     --distribute               pre_training by several devices: "true"(training by more than 1 device) | "false", default is "false"
     --epoch_size               epoch size: N, default is 1
     --device_num               number of used devices: N, default is 1
@@ -84,9 +139,11 @@ options:
     --do_shuffle               enable shuffle: "true" | "false", default is "true"
     --enable_data_sink         enable data sink: "true" | "false", default is "true"
     --data_sink_steps          set data sink steps: N, default is 1
-    --checkpoint_path          path to save checkpoint files: PATH, default is ""
+    --save_checkpoint_path     path to save checkpoint files: PATH, default is ""
+    --load_checkpoint_path     path to load checkpoint files: PATH, default is ""
     --save_checkpoint_steps    steps for saving checkpoint files: N, default is 1000
     --save_checkpoint_num      number for saving checkpoint files: N, default is 1
+    --train_steps              Training Steps: N, default is -1
     --data_dir                 path to dataset directory: PATH, default is ""
     --schema_dir               path to schema.json file, PATH, default is ""
```
@@ -96,13 +153,15 @@ usage: run_ner.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [----d
                   [--assessment_method ASSESSMENT_METHOD] [--use_crf USE_CRF]
                   [--device_id N] [--epoch_num N] [--vocab_file_path VOCAB_FILE_PATH]
                   [--label2id_file_path LABEL2ID_FILE_PATH]
+                  [--train_data_shuffle TRAIN_DATA_SHUFFLE]
+                  [--eval_data_shuffle EVAL_DATA_SHUFFLE]
                   [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
                   [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
                   [--train_data_file_path TRAIN_DATA_FILE_PATH]
                   [--eval_data_file_path EVAL_DATA_FILE_PATH]
                   [--schema_file_path SCHEMA_FILE_PATH]
options:
-    --device_target            targeted device to run task: Ascend | GPU
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
     --do_train                 whether to run training on training set: true | false
     --do_eval                  whether to run eval on dev set: true | false
     --assessment_method        assessment method to do evaluation: f1 | clue_benchmark
     --use_crf                  whether to use crf to calculate loss: true | false
     --device_id                device id to run task
     --epoch_num                total number of training epochs to perform
     --num_class                number of classes to do labeling
+    --train_data_shuffle       Enable train data shuffle, default is true
+    --eval_data_shuffle        Enable eval data shuffle, default is true
     --vocab_file_path          the vocabulary file that the BERT model was trained on
     --label2id_file_path       label to id json file
     --save_finetune_checkpoint_path   path to save generated finetuning checkpoint
@@ -123,6 +184,8 @@ usage: run_squad.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [----d
                     [--device_id N] [--epoch_num N] [--num_class N]
                     [--vocab_file_path VOCAB_FILE_PATH]
                     [--eval_json_path EVAL_JSON_PATH]
+                    [--train_data_shuffle TRAIN_DATA_SHUFFLE]
+                    [--eval_data_shuffle EVAL_DATA_SHUFFLE]
                     [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
                     [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
                     [--load_finetune_checkpoint_path LOAD_FINETUNE_CHECKPOINT_PATH]
@@ -130,12 +193,14 @@ usage: run_squad.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [----d
                     [--train_data_file_path TRAIN_DATA_FILE_PATH]
                     [--eval_data_file_path EVAL_DATA_FILE_PATH]
                     [--schema_file_path SCHEMA_FILE_PATH]
options:
-    --device_target            targeted device to run task: Ascend | GPU
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
     --do_train                 whether to run training on training set: true | false
     --do_eval                  whether to run eval on dev set: true | false
     --device_id                device id to run task
     --epoch_num                total number of training epochs to perform
     --num_class                number of classes to classify, usually 2 for squad task
+    --train_data_shuffle       Enable train data shuffle, default is true
+    --eval_data_shuffle        Enable eval data shuffle, default is true
     --vocab_file_path          the vocabulary file that the BERT model was trained on
     --eval_json_path           path to squad dev json file
     --save_finetune_checkpoint_path   path to save generated finetuning checkpoint
@@ -150,6 +215,8 @@ usage: run_classifier.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [
                          [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
                          [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
                          [--load_finetune_checkpoint_path LOAD_FINETUNE_CHECKPOINT_PATH]
+                         [--train_data_shuffle TRAIN_DATA_SHUFFLE]
+                         [--eval_data_shuffle EVAL_DATA_SHUFFLE]
                          [--train_data_file_path TRAIN_DATA_FILE_PATH]
                          [--eval_data_file_path EVAL_DATA_FILE_PATH]
                          [--schema_file_path SCHEMA_FILE_PATH]
@@ -161,6 +228,8 @@ options:
-    --device_target            targeted device to run task: Ascend | GPU
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
     --do_train                 whether to run training on training set: true | false
     --do_eval                  whether to run eval on dev set: true | false
     --device_id                device id to run task
     --epoch_num                total number of training epochs to perform
     --num_class                number of classes to do labeling
+    --train_data_shuffle       Enable train data shuffle, default is true
+    --eval_data_shuffle        Enable eval data shuffle, default is true
     --save_finetune_checkpoint_path   path to save generated finetuning checkpoint
     --load_pretrain_checkpoint_path   initial checkpoint (usually from a pre-trained BERT model)
     --load_finetune_checkpoint_path   give a finetuning checkpoint path if only do eval
     --train_data_file_path     tfrecord for training. E.g., train.tfrecord
     --eval_data_file_path      tfrecord for predictions. E.g., dev.tfrecord
     --schema_file_path         path to datafile schema file
```
## Options and Parameters
-It contains of parameters of BERT model and options for training, which is set in file `config.py` and `finetune_eval_config.py` respectively.
+Parameters for training and evaluation can be set in file `config.py` and `finetune_eval_config.py` respectively.
### Options:
```
-config.py:
+config for loss scale and other options:
    bert_network                version of BERT model: base | nezha, default is base
    loss_scale_value            initial value of loss scale: N, default is 2^32
    scale_factor                factor used to update loss scale: N, default is 2
@@ -225,3 +294,133 @@ Parameters for optimizer:
    momentum                    momentum for the moving average: Q
```
+## [Training Process](#contents)
+### Training
+#### running on Ascend
+```
+bash scripts/run_standalone_pretrain_ascend.sh 0 1 /path/cn-wiki-128
+```
+The command above will run in the background; you can view the results in the file pretraining_log.txt. After training, you will get some checkpoint files under the script folder by default. The loss value will be shown as follows:
+```
+# grep "epoch" pretraining_log.txt
+epoch: 0.0, current epoch percent: 0.000, step: 1, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.0856101e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+epoch: 0.0, current epoch percent: 0.000, step: 2, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.0821701e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+```
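+Equivalently, pre-training can be launched by calling run_pretrain.py directly with the parameters documented in [Script Parameters](#script-parameters); the paths and values below are illustrative placeholders only:
+
+```bash
+# Hedged sketch of a direct single-device invocation using only documented flags;
+# dataset and schema paths are placeholders.
+python run_pretrain.py \
+    --device_target="Ascend" \
+    --distribute="false" \
+    --epoch_size=1 \
+    --device_id=0 \
+    --enable_save_ckpt="true" \
+    --do_shuffle="true" \
+    --data_dir=/path/cn-wiki-128 \
+    --schema_dir=/path/schema.json > pretraining_log.txt 2>&1 &
+```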
+### Distributed Training
+#### running on Ascend
+```
+bash scripts/run_distributed_pretrain_ascend.sh /path/cn-wiki-128 /path/hccl.json
+```
+The command above will run in the background; you can view the results in the file pretraining_log.txt. After training, you will get some checkpoint files under the LOG* folder by default. The loss value will be shown as follows:
+```
+# grep "epoch" LOG*/pretraining_log.txt
+epoch: 0.0, current epoch percent: 0.001, step: 100, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.08209e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+epoch: 0.0, current epoch percent: 0.002, step: 200, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.07566e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+epoch: 0.0, current epoch percent: 0.001, step: 100, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.08218e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+epoch: 0.0, current epoch percent: 0.002, step: 200, outputs are (Tensor(shape=[1], dtype=Float32, [ 1.07770e+01]), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+```
+
+## [Evaluation Process](#contents)
+### Evaluation
+#### evaluation on cola dataset when running on Ascend
+Before running the command below, please check that the load pretrain checkpoint path has been set. Please set the checkpoint path to be an absolute full path, e.g. "/username/pretrain/checkpoint_100_300.ckpt".
+```
+bash scripts/run_classifier.sh
+
+The command above will run in the background; you can view the results in the file classfier_log.txt.
+
+If you choose accuracy as assessment method, the result will be as follows:
+acc_num XXX, total_num XXX, accuracy 0.588986
+```
+
+#### evaluation on cluener dataset when running on Ascend
+```
+bash scripts/run_ner.sh
+
+The command above will run in the background; you can view the results in the file ner_log.txt.
+
+If you choose F1 as assessment method, the result will be as follows:
+Precision 0.920507
+Recall 0.948683
+F1 0.920507
+```
+
+#### evaluation on squad v1.1 dataset when running on Ascend
+```
+bash scripts/run_squad.sh
+
+The command above will run in the background; you can view the results in the file squad_log.txt.
+The result will be as follows:
+{"exact_match": 80.3878923040233284, "f1": 87.6902384023850329}
+```
+
+# [Model Description](#contents)
+## [Performance](#contents)
+### Pretraining Performance
+| Parameters                 | BERT                                         | BERT                      |
+| -------------------------- | -------------------------------------------- | ------------------------- |
+| Model Version              | base                                         | base                      |
+| Resource                   | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SXM2 V100-32G          |
+| uploaded Date              | 08/22/2020                                   | 05/06/2020                |
+| MindSpore Version          | 0.6.0                                        | 0.3.0                     |
+| Dataset                    | cn-wiki-128                                  | ImageNet                  |
+| Training Parameters        | src/config.py                                | src/config.py             |
+| Optimizer                  | Lamb                                         | Momentum                  |
+| Loss Function              | SoftmaxCrossEntropy                          | SoftmaxCrossEntropy       |
+| outputs                    | probability                                  |                           |
+| Loss                       |                                              | 1.913                     |
+| Speed                      | 116.5 ms/step                                | 1.913                     |
+| Total time                 |                                              |                           |
+| Params (M)                 | 110M                                         |                           |
+| Checkpoint for Fine tuning | 1.2G(.ckpt file)                             |                           |
+
+| Parameters                 | BERT                                         | BERT                      |
+| -------------------------- | -------------------------------------------- | ------------------------- |
+| Model Version              | NEZHA                                        | NEZHA                     |
+| Resource                   | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SXM2 V100-32G          |
+| uploaded Date              | 08/20/2020                                   | 05/06/2020                |
+| MindSpore Version          | 0.6.0                                        | 0.3.0                     |
+| Dataset                    | cn-wiki-128                                  | ImageNet                  |
+| Training Parameters        | src/config.py                                | src/config.py             |
+| Optimizer                  | Lamb                                         | Momentum                  |
+| Loss Function              | SoftmaxCrossEntropy                          | SoftmaxCrossEntropy       |
+| outputs                    | probability                                  |                           |
+| Loss                       |                                              | 1.913                     |
+| Speed                      |                                              | 1.913                     |
+| Total time                 |                                              |                           |
+| Params (M)                 | 340M                                         |                           |
+| Checkpoint for Fine tuning | 3.2G(.ckpt file)                             |                           |
+
+#### Inference Performance
+
+| Parameters          |                  |                           |                 |
+| ------------------- | ---------------- | ------------------------- | --------------- |
+| Model Version       | V1               |                           |                 |
+| Resource            | Huawei 910       | NV SXM2 V100-32G          | Huawei 310      |
+| uploaded Date       | 08/22/2020       | 05/22/2020                |                 |
+| MindSpore Version   | 0.6.0            | 0.2.0                     | 0.2.0           |
+| Dataset             | cola, 1.2W       | ImageNet, 1.2W            | ImageNet, 1.2W  |
+| batch_size          | 32(1P)           | 130(8P)                   |                 |
+| Accuracy            | 0.588986         | ACC1[72.07%] ACC5[90.90%] |                 |
+| Speed               | 59.25ms/step     |                           |                 |
+| Total time          |                  |                           |                 |
+| Model for inference | 1.2G(.ckpt file) |                           |                 |
+
+# [Description of Random Situation](#contents)
+
+In run_standalone_pretrain_ascend.sh and run_distributed_pretrain_ascend.sh, we set do_shuffle to shuffle the dataset.
+
+In run_classifier.sh, run_ner.sh and run_squad.sh, we set train_data_shuffle and eval_data_shuffle to shuffle the dataset.
+
+In config.py, we set the hidden_dropout_prob and attention_probs_dropout_prob to drop out some network nodes.
+
+In run_pretrain.py, we set the random seed to make sure that distributed training starts from the same initial weights.
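+If a more repeatable fine-tuning run is needed, the shuffle switches documented above can be turned off; the call below is an assumed illustration of those flags (other required arguments omitted), not a recommended default:
+
+```bash
+# Hedged sketch: disable train and eval shuffling for a more deterministic run.
+python run_classifier.py --train_data_shuffle="false" --eval_data_shuffle="false"
+```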
+# [ModelZoo Homepage](#contents)
+
+Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
diff --git a/model_zoo/official/nlp/bert/run_classifier.py b/model_zoo/official/nlp/bert/run_classifier.py
index e9a7e7864..09e38794f 100644
--- a/model_zoo/official/nlp/bert/run_classifier.py
+++ b/model_zoo/official/nlp/bert/run_classifier.py
@@ -48,7 +48,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
                                        warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                        decay_steps=steps_per_epoch * epoch_num,
                                        power=optimizer_cfg.AdamWeightDecay.power)
-        params = net_with_loss.trainable_params()
+        params = network.trainable_params()
         decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
         other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
         group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
index 41f656d03..6489f7c1e 100644
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
@@ -133,7 +133,7 @@ def distribute_pretrain():
         cmd += opt
         cmd += " --data_dir=" + data_dir
         cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
-               + str(rank_size) + ' >./log.txt 2>&1 &'
+               + str(rank_size) + ' >./pretraining_log.txt 2>&1 &'
         os.system(cmd)
     os.chdir(cur_dir)
diff --git a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms2tf_config.py b/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms2tf_config.py
deleted file mode 100644
index 085fe72cc..000000000
--- a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms2tf_config.py
+++ /dev/null
@@ -1,636 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================ - -""" - # tf and ms bert large checkpoint param transfer table - # key: tf - # value: ms -""" -param_name_dict = { - 'bert/embeddings/word_embeddings': 'bert.bert.bert_embedding_lookup.embedding_table', - 'bert/embeddings/token_type_embeddings': 'bert.bert.bert_embedding_postprocessor.embedding_table', - 'bert/embeddings/position_embeddings': 'bert.bert.bert_embedding_postprocessor.full_position_embeddings', - 'bert/embeddings/LayerNorm/gamma': 'bert.bert.bert_embedding_postprocessor.layernorm.gamma', - 'bert/embeddings/LayerNorm/beta': 'bert.bert.bert_embedding_postprocessor.layernorm.beta', - 'bert/encoder/layer_0/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.0.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_0/attention/self/query/bias': 'bert.bert.bert_encoder.layers.0.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_0/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.0.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_0/attention/self/key/bias': 'bert.bert.bert_encoder.layers.0.attention.attention' - '.key_layer.bias', - 'bert/encoder/layer_0/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.0.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_0/attention/self/value/bias': 'bert.bert.bert_encoder.layers.0.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_0/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.0.attention.output.dense' - '.weight', - 'bert/encoder/layer_0/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.0.attention.output.dense.bias', - 'bert/encoder/layer_0/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.0.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_0/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.0.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_0/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.0.intermediate.weight', - 'bert/encoder/layer_0/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.0.intermediate.bias', - 'bert/encoder/layer_0/output/dense/kernel': 'bert.bert.bert_encoder.layers.0.output.dense.weight', - 'bert/encoder/layer_0/output/dense/bias': 'bert.bert.bert_encoder.layers.0.output.dense.bias', - 'bert/encoder/layer_0/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.0.output.layernorm.gamma', - 'bert/encoder/layer_0/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.0.output.layernorm.beta', - 'bert/encoder/layer_1/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.1.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_1/attention/self/query/bias': 'bert.bert.bert_encoder.layers.1.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_1/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.1.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_1/attention/self/key/bias': 'bert.bert.bert_encoder.layers.1.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_1/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.1.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_1/attention/self/value/bias': 'bert.bert.bert_encoder.layers.1.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_1/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.1.attention.output.dense' - '.weight', - 'bert/encoder/layer_1/attention/output/dense/bias': 
'bert.bert.bert_encoder.layers.1.attention.output.dense.bias', - 'bert/encoder/layer_1/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.1.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_1/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.1.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_1/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.1.intermediate.weight', - 'bert/encoder/layer_1/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.1.intermediate.bias', - 'bert/encoder/layer_1/output/dense/kernel': 'bert.bert.bert_encoder.layers.1.output.dense.weight', - 'bert/encoder/layer_1/output/dense/bias': 'bert.bert.bert_encoder.layers.1.output.dense.bias', - 'bert/encoder/layer_1/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.1.output.layernorm.gamma', - 'bert/encoder/layer_1/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.1.output.layernorm.beta', - 'bert/encoder/layer_2/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.2.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_2/attention/self/query/bias': 'bert.bert.bert_encoder.layers.2.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_2/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.2.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_2/attention/self/key/bias': 'bert.bert.bert_encoder.layers.2.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_2/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.2.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_2/attention/self/value/bias': 'bert.bert.bert_encoder.layers.2.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_2/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.2.attention.output.dense' - '.weight', - 'bert/encoder/layer_2/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.2.attention.output.dense.bias', - 'bert/encoder/layer_2/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.2.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_2/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.2.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_2/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.2.intermediate.weight', - 'bert/encoder/layer_2/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.2.intermediate.bias', - 'bert/encoder/layer_2/output/dense/kernel': 'bert.bert.bert_encoder.layers.2.output.dense.weight', - 'bert/encoder/layer_2/output/dense/bias': 'bert.bert.bert_encoder.layers.2.output.dense.bias', - 'bert/encoder/layer_2/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.2.output.layernorm.gamma', - 'bert/encoder/layer_2/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.2.output.layernorm.beta', - 'bert/encoder/layer_3/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.3.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_3/attention/self/query/bias': 'bert.bert.bert_encoder.layers.3.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_3/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.3.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_3/attention/self/key/bias': 'bert.bert.bert_encoder.layers.3.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_3/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.3.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_3/attention/self/value/bias': 
'bert.bert.bert_encoder.layers.3.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_3/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.3.attention.output.dense' - '.weight', - 'bert/encoder/layer_3/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.3.attention.output.dense.bias', - 'bert/encoder/layer_3/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.3.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_3/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.3.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_3/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.3.intermediate.weight', - 'bert/encoder/layer_3/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.3.intermediate.bias', - 'bert/encoder/layer_3/output/dense/kernel': 'bert.bert.bert_encoder.layers.3.output.dense.weight', - 'bert/encoder/layer_3/output/dense/bias': 'bert.bert.bert_encoder.layers.3.output.dense.bias', - 'bert/encoder/layer_3/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.3.output.layernorm.gamma', - 'bert/encoder/layer_3/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.3.output.layernorm.beta', - 'bert/encoder/layer_4/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.4.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_4/attention/self/query/bias': 'bert.bert.bert_encoder.layers.4.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_4/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.4.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_4/attention/self/key/bias': 'bert.bert.bert_encoder.layers.4' - '.attention.attention.key_layer.bias', - 'bert/encoder/layer_4/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.4.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_4/attention/self/value/bias': 'bert.bert.bert_encoder.layers.4.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_4/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.4.attention.output.dense' - '.weight', - 'bert/encoder/layer_4/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.4.attention.output.dense.bias', - 'bert/encoder/layer_4/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.4.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_4/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.4.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_4/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.4.intermediate.weight', - 'bert/encoder/layer_4/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.4.intermediate.bias', - 'bert/encoder/layer_4/output/dense/kernel': 'bert.bert.bert_encoder.layers.4.output.dense.weight', - 'bert/encoder/layer_4/output/dense/bias': 'bert.bert.bert_encoder.layers.4.output.dense.bias', - 'bert/encoder/layer_4/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.4.output.layernorm.gamma', - 'bert/encoder/layer_4/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.4.output.layernorm.beta', - 'bert/encoder/layer_5/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.5.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_5/attention/self/query/bias': 'bert.bert.bert_encoder.layers.5.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_5/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.5.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_5/attention/self/key/bias': 
'bert.bert.bert_encoder.layers.5.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_5/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.5.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_5/attention/self/value/bias': 'bert.bert.bert_encoder.layers.5.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_5/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.5.attention.output.dense' - '.weight', - 'bert/encoder/layer_5/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.5.attention.output.dense.bias', - 'bert/encoder/layer_5/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.5.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_5/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.5.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_5/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.5.intermediate.weight', - 'bert/encoder/layer_5/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.5.intermediate.bias', - 'bert/encoder/layer_5/output/dense/kernel': 'bert.bert.bert_encoder.layers.5.output.dense.weight', - 'bert/encoder/layer_5/output/dense/bias': 'bert.bert.bert_encoder.layers.5.output.dense.bias', - 'bert/encoder/layer_5/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.5.output.layernorm.gamma', - 'bert/encoder/layer_5/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.5.output.layernorm.beta', - 'bert/encoder/layer_6/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.6.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_6/attention/self/query/bias': 'bert.bert.bert_encoder.layers.6.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_6/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.6.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_6/attention/self/key/bias': 'bert.bert.bert_encoder.layers.6.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_6/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.6.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_6/attention/self/value/bias': 'bert.bert.bert_encoder.layers.6.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_6/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.6.attention.output.dense' - '.weight', - 'bert/encoder/layer_6/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.6.attention.output.dense.bias', - 'bert/encoder/layer_6/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.6.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_6/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.6.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_6/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.6.intermediate.weight', - 'bert/encoder/layer_6/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.6.intermediate.bias', - 'bert/encoder/layer_6/output/dense/kernel': 'bert.bert.bert_encoder.layers.6.output.dense.weight', - 'bert/encoder/layer_6/output/dense/bias': 'bert.bert.bert_encoder.layers.6.output.dense.bias', - 'bert/encoder/layer_6/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.6.output.layernorm.gamma', - 'bert/encoder/layer_6/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.6.output.layernorm.beta', - 'bert/encoder/layer_7/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.7.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_7/attention/self/query/bias': 
'bert.bert.bert_encoder.layers.7.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_7/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.7.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_7/attention/self/key/bias': 'bert.bert.bert_encoder.layers.7.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_7/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.7.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_7/attention/self/value/bias': 'bert.bert.bert_encoder.layers.7.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_7/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.7.attention.output.dense' - '.weight', - 'bert/encoder/layer_7/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.7.attention.output.dense.bias', - 'bert/encoder/layer_7/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.7.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_7/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.7.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_7/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.7.intermediate.weight', - 'bert/encoder/layer_7/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.7.intermediate.bias', - 'bert/encoder/layer_7/output/dense/kernel': 'bert.bert.bert_encoder.layers.7.output.dense.weight', - 'bert/encoder/layer_7/output/dense/bias': 'bert.bert.bert_encoder.layers.7.output.dense.bias', - 'bert/encoder/layer_7/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.7.output.layernorm.gamma', - 'bert/encoder/layer_7/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.7.output.layernorm.beta', - 'bert/encoder/layer_8/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.8.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_8/attention/self/query/bias': 'bert.bert.bert_encoder.layers.8.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_8/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.8.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_8/attention/self/key/bias': 'bert.bert.bert_encoder.layers.8.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_8/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.8.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_8/attention/self/value/bias': 'bert.bert.bert_encoder.layers.8.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_8/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.8.attention.output.dense' - '.weight', - 'bert/encoder/layer_8/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.8.attention.output.dense.bias', - 'bert/encoder/layer_8/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.8.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_8/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.8.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_8/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.8.intermediate.weight', - 'bert/encoder/layer_8/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.8.intermediate.bias', - 'bert/encoder/layer_8/output/dense/kernel': 'bert.bert.bert_encoder.layers.8.output.dense.weight', - 'bert/encoder/layer_8/output/dense/bias': 'bert.bert.bert_encoder.layers.8.output.dense.bias', - 'bert/encoder/layer_8/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.8.output.layernorm.gamma', - 
'bert/encoder/layer_8/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.8.output.layernorm.beta', - 'bert/encoder/layer_9/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.9.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_9/attention/self/query/bias': 'bert.bert.bert_encoder.layers.9.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_9/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.9.attention.attention.key_layer' - '.weight', - 'bert/encoder/layer_9/attention/self/key/bias': 'bert.bert.bert_encoder.layers.9.attention' - '.attention.key_layer.bias', - 'bert/encoder/layer_9/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.9.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_9/attention/self/value/bias': 'bert.bert.bert_encoder.layers.9.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_9/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.9.attention.output.dense' - '.weight', - 'bert/encoder/layer_9/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.9.attention.output.dense.bias', - 'bert/encoder/layer_9/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.9.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_9/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.9.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_9/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.9.intermediate.weight', - 'bert/encoder/layer_9/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.9.intermediate.bias', - 'bert/encoder/layer_9/output/dense/kernel': 'bert.bert.bert_encoder.layers.9.output.dense.weight', - 'bert/encoder/layer_9/output/dense/bias': 'bert.bert.bert_encoder.layers.9.output.dense.bias', - 'bert/encoder/layer_9/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.9.output.layernorm.gamma', - 'bert/encoder/layer_9/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.9.output.layernorm.beta', - 'bert/encoder/layer_10/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.10.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_10/attention/self/query/bias': 'bert.bert.bert_encoder.layers.10.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_10/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.10.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_10/attention/self/key/bias': 'bert.bert.bert_encoder.layers.10.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_10/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.10.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_10/attention/self/value/bias': 'bert.bert.bert_encoder.layers.10.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_10/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.10.attention.output.dense' - '.weight', - 'bert/encoder/layer_10/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.10.attention.output.dense.bias', - 'bert/encoder/layer_10/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.10.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_10/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.10.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_10/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.10.intermediate.weight', - 'bert/encoder/layer_10/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.10.intermediate.bias', - 
'bert/encoder/layer_10/output/dense/kernel': 'bert.bert.bert_encoder.layers.10.output.dense.weight', - 'bert/encoder/layer_10/output/dense/bias': 'bert.bert.bert_encoder.layers.10.output.dense.bias', - 'bert/encoder/layer_10/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.10.output.layernorm.gamma', - 'bert/encoder/layer_10/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.10.output.layernorm.beta', - 'bert/encoder/layer_11/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.11.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_11/attention/self/query/bias': 'bert.bert.bert_encoder.layers.11.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_11/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.11.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_11/attention/self/key/bias': 'bert.bert.bert_encoder.layers.11.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_11/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.11.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_11/attention/self/value/bias': 'bert.bert.bert_encoder.layers.11.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_11/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.11.attention.output.dense' - '.weight', - 'bert/encoder/layer_11/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.11.attention.output.dense.bias', - 'bert/encoder/layer_11/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.11.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_11/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.11.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_11/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.11.intermediate.weight', - 'bert/encoder/layer_11/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.11.intermediate.bias', - 'bert/encoder/layer_11/output/dense/kernel': 'bert.bert.bert_encoder.layers.11.output.dense.weight', - 'bert/encoder/layer_11/output/dense/bias': 'bert.bert.bert_encoder.layers.11.output.dense.bias', - 'bert/encoder/layer_11/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.11.output.layernorm.gamma', - 'bert/encoder/layer_11/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.11.output.layernorm.beta', - 'bert/encoder/layer_12/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.12.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_12/attention/self/query/bias': 'bert.bert.bert_encoder.layers.12.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_12/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.12.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_12/attention/self/key/bias': 'bert.bert.bert_encoder.layers.12.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_12/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.12.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_12/attention/self/value/bias': 'bert.bert.bert_encoder.layers.12.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_12/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.12.attention.output.dense' - '.weight', - 'bert/encoder/layer_12/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.12.attention.output.dense.bias', - 'bert/encoder/layer_12/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.12.attention.output' - '.layernorm.gamma', - 
'bert/encoder/layer_12/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.12.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_12/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.12.intermediate.weight', - 'bert/encoder/layer_12/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.12.intermediate.bias', - 'bert/encoder/layer_12/output/dense/kernel': 'bert.bert.bert_encoder.layers.12.output.dense.weight', - 'bert/encoder/layer_12/output/dense/bias': 'bert.bert.bert_encoder.layers.12.output.dense.bias', - 'bert/encoder/layer_12/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.12.output.layernorm.gamma', - 'bert/encoder/layer_12/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.12.output.layernorm.beta', - 'bert/encoder/layer_13/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.13.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_13/attention/self/query/bias': 'bert.bert.bert_encoder.layers.13.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_13/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.13.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_13/attention/self/key/bias': 'bert.bert.bert_encoder.layers.13.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_13/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.13.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_13/attention/self/value/bias': 'bert.bert.bert_encoder.layers.13.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_13/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.13.attention.output.dense' - '.weight', - 'bert/encoder/layer_13/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.13.attention.output.dense.bias', - 'bert/encoder/layer_13/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.13.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_13/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.13.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_13/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.13.intermediate.weight', - 'bert/encoder/layer_13/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.13.intermediate.bias', - 'bert/encoder/layer_13/output/dense/kernel': 'bert.bert.bert_encoder.layers.13.output.dense.weight', - 'bert/encoder/layer_13/output/dense/bias': 'bert.bert.bert_encoder.layers.13.output.dense.bias', - 'bert/encoder/layer_13/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.13.output.layernorm.gamma', - 'bert/encoder/layer_13/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.13.output.layernorm.beta', - 'bert/encoder/layer_14/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.14.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_14/attention/self/query/bias': 'bert.bert.bert_encoder.layers.14.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_14/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.14.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_14/attention/self/key/bias': 'bert.bert.bert_encoder.layers.14.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_14/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.14.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_14/attention/self/value/bias': 'bert.bert.bert_encoder.layers.14.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_14/attention/output/dense/kernel': 
'bert.bert.bert_encoder.layers.14.attention.output.dense' - '.weight', - 'bert/encoder/layer_14/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.14.attention.output.dense.bias', - 'bert/encoder/layer_14/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.14.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_14/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.14.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_14/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.14.intermediate.weight', - 'bert/encoder/layer_14/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.14.intermediate.bias', - 'bert/encoder/layer_14/output/dense/kernel': 'bert.bert.bert_encoder.layers.14.output.dense.weight', - 'bert/encoder/layer_14/output/dense/bias': 'bert.bert.bert_encoder.layers.14.output.dense.bias', - 'bert/encoder/layer_14/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.14.output.layernorm.gamma', - 'bert/encoder/layer_14/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.14.output.layernorm.beta', - 'bert/encoder/layer_15/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.15.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_15/attention/self/query/bias': 'bert.bert.bert_encoder.layers.15.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_15/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.15.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_15/attention/self/key/bias': 'bert.bert.bert_encoder.layers.15.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_15/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.15.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_15/attention/self/value/bias': 'bert.bert.bert_encoder.layers.15.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_15/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.15.attention.output.dense' - '.weight', - 'bert/encoder/layer_15/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.15.attention.output.dense.bias', - 'bert/encoder/layer_15/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.15.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_15/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.15.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_15/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.15.intermediate.weight', - 'bert/encoder/layer_15/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.15.intermediate.bias', - 'bert/encoder/layer_15/output/dense/kernel': 'bert.bert.bert_encoder.layers.15.output.dense.weight', - 'bert/encoder/layer_15/output/dense/bias': 'bert.bert.bert_encoder.layers.15.output.dense.bias', - 'bert/encoder/layer_15/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.15.output.layernorm.gamma', - 'bert/encoder/layer_15/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.15.output.layernorm.beta', - 'bert/encoder/layer_16/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.16.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_16/attention/self/query/bias': 'bert.bert.bert_encoder.layers.16.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_16/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.16.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_16/attention/self/key/bias': 'bert.bert.bert_encoder.layers.16.attention.attention.key_layer' - '.bias', - 
'bert/encoder/layer_16/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.16.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_16/attention/self/value/bias': 'bert.bert.bert_encoder.layers.16.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_16/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.16.attention.output.dense' - '.weight', - 'bert/encoder/layer_16/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.16.attention.output.dense.bias', - 'bert/encoder/layer_16/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.16.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_16/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.16.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_16/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.16.intermediate.weight', - 'bert/encoder/layer_16/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.16.intermediate.bias', - 'bert/encoder/layer_16/output/dense/kernel': 'bert.bert.bert_encoder.layers.16.output.dense.weight', - 'bert/encoder/layer_16/output/dense/bias': 'bert.bert.bert_encoder.layers.16.output.dense.bias', - 'bert/encoder/layer_16/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.16.output.layernorm.gamma', - 'bert/encoder/layer_16/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.16.output.layernorm.beta', - 'bert/encoder/layer_17/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.17.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_17/attention/self/query/bias': 'bert.bert.bert_encoder.layers.17.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_17/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.17.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_17/attention/self/key/bias': 'bert.bert.bert_encoder.layers.17.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_17/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.17.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_17/attention/self/value/bias': 'bert.bert.bert_encoder.layers.17.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_17/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.17.attention.output.dense' - '.weight', - 'bert/encoder/layer_17/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.17.attention.output.dense.bias', - 'bert/encoder/layer_17/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.17.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_17/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.17.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_17/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.17.intermediate.weight', - 'bert/encoder/layer_17/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.17.intermediate.bias', - 'bert/encoder/layer_17/output/dense/kernel': 'bert.bert.bert_encoder.layers.17.output.dense.weight', - 'bert/encoder/layer_17/output/dense/bias': 'bert.bert.bert_encoder.layers.17.output.dense.bias', - 'bert/encoder/layer_17/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.17.output.layernorm.gamma', - 'bert/encoder/layer_17/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.17.output.layernorm.beta', - 'bert/encoder/layer_18/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.18.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_18/attention/self/query/bias': 
'bert.bert.bert_encoder.layers.18.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_18/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.18.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_18/attention/self/key/bias': 'bert.bert.bert_encoder.layers.18.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_18/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.18.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_18/attention/self/value/bias': 'bert.bert.bert_encoder.layers.18.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_18/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.18.attention.output.dense' - '.weight', - 'bert/encoder/layer_18/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.18.attention.output.dense.bias', - 'bert/encoder/layer_18/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.18.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_18/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.18.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_18/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.18.intermediate.weight', - 'bert/encoder/layer_18/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.18.intermediate.bias', - 'bert/encoder/layer_18/output/dense/kernel': 'bert.bert.bert_encoder.layers.18.output.dense.weight', - 'bert/encoder/layer_18/output/dense/bias': 'bert.bert.bert_encoder.layers.18.output.dense.bias', - 'bert/encoder/layer_18/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.18.output.layernorm.gamma', - 'bert/encoder/layer_18/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.18.output.layernorm.beta', - 'bert/encoder/layer_19/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.19.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_19/attention/self/query/bias': 'bert.bert.bert_encoder.layers.19.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_19/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.19.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_19/attention/self/key/bias': 'bert.bert.bert_encoder.layers.19.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_19/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.19.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_19/attention/self/value/bias': 'bert.bert.bert_encoder.layers.19.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_19/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.19.attention.output.dense' - '.weight', - 'bert/encoder/layer_19/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.19.attention.output.dense.bias', - 'bert/encoder/layer_19/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.19.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_19/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.19.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_19/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.19.intermediate.weight', - 'bert/encoder/layer_19/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.19.intermediate.bias', - 'bert/encoder/layer_19/output/dense/kernel': 'bert.bert.bert_encoder.layers.19.output.dense.weight', - 'bert/encoder/layer_19/output/dense/bias': 'bert.bert.bert_encoder.layers.19.output.dense.bias', - 'bert/encoder/layer_19/output/LayerNorm/gamma': 
'bert.bert.bert_encoder.layers.19.output.layernorm.gamma', - 'bert/encoder/layer_19/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.19.output.layernorm.beta', - 'bert/encoder/layer_20/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.20.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_20/attention/self/query/bias': 'bert.bert.bert_encoder.layers.20.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_20/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.20.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_20/attention/self/key/bias': 'bert.bert.bert_encoder.layers.20.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_20/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.20.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_20/attention/self/value/bias': 'bert.bert.bert_encoder.layers.20.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_20/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.20.attention.output.dense' - '.weight', - 'bert/encoder/layer_20/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.20.attention.output.dense.bias', - 'bert/encoder/layer_20/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.20.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_20/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.20.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_20/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.20.intermediate.weight', - 'bert/encoder/layer_20/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.20.intermediate.bias', - 'bert/encoder/layer_20/output/dense/kernel': 'bert.bert.bert_encoder.layers.20.output.dense.weight', - 'bert/encoder/layer_20/output/dense/bias': 'bert.bert.bert_encoder.layers.20.output.dense.bias', - 'bert/encoder/layer_20/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.20.output.layernorm.gamma', - 'bert/encoder/layer_20/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.20.output.layernorm.beta', - 'bert/encoder/layer_21/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.21.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_21/attention/self/query/bias': 'bert.bert.bert_encoder.layers.21.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_21/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.21.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_21/attention/self/key/bias': 'bert.bert.bert_encoder.layers.21.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_21/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.21.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_21/attention/self/value/bias': 'bert.bert.bert_encoder.layers.21.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_21/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.21.attention.output.dense' - '.weight', - 'bert/encoder/layer_21/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.21.attention.output.dense.bias', - 'bert/encoder/layer_21/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.21.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_21/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.21.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_21/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.21.intermediate.weight', - 
'bert/encoder/layer_21/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.21.intermediate.bias', - 'bert/encoder/layer_21/output/dense/kernel': 'bert.bert.bert_encoder.layers.21.output.dense.weight', - 'bert/encoder/layer_21/output/dense/bias': 'bert.bert.bert_encoder.layers.21.output.dense.bias', - 'bert/encoder/layer_21/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.21.output.layernorm.gamma', - 'bert/encoder/layer_21/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.21.output.layernorm.beta', - 'bert/encoder/layer_22/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.22.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_22/attention/self/query/bias': 'bert.bert.bert_encoder.layers.22.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_22/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.22.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_22/attention/self/key/bias': 'bert.bert.bert_encoder.layers.22.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_22/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.22.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_22/attention/self/value/bias': 'bert.bert.bert_encoder.layers.22.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_22/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.22.attention.output.dense' - '.weight', - 'bert/encoder/layer_22/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.22.attention.output.dense.bias', - 'bert/encoder/layer_22/attention/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.22.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_22/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.22.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_22/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.22.intermediate.weight', - 'bert/encoder/layer_22/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.22.intermediate.bias', - 'bert/encoder/layer_22/output/dense/kernel': 'bert.bert.bert_encoder.layers.22.output.dense.weight', - 'bert/encoder/layer_22/output/dense/bias': 'bert.bert.bert_encoder.layers.22.output.dense.bias', - 'bert/encoder/layer_22/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.22.output.layernorm.gamma', - 'bert/encoder/layer_22/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.22.output.layernorm.beta', - 'bert/encoder/layer_23/attention/self/query/kernel': 'bert.bert.bert_encoder.layers.23.attention.attention' - '.query_layer.weight', - 'bert/encoder/layer_23/attention/self/query/bias': 'bert.bert.bert_encoder.layers.23.attention.attention' - '.query_layer.bias', - 'bert/encoder/layer_23/attention/self/key/kernel': 'bert.bert.bert_encoder.layers.23.attention.attention' - '.key_layer.weight', - 'bert/encoder/layer_23/attention/self/key/bias': 'bert.bert.bert_encoder.layers.23.attention.attention.key_layer' - '.bias', - 'bert/encoder/layer_23/attention/self/value/kernel': 'bert.bert.bert_encoder.layers.23.attention.attention' - '.value_layer.weight', - 'bert/encoder/layer_23/attention/self/value/bias': 'bert.bert.bert_encoder.layers.23.attention.attention' - '.value_layer.bias', - 'bert/encoder/layer_23/attention/output/dense/kernel': 'bert.bert.bert_encoder.layers.23.attention.output.dense' - '.weight', - 'bert/encoder/layer_23/attention/output/dense/bias': 'bert.bert.bert_encoder.layers.23.attention.output.dense.bias', - 'bert/encoder/layer_23/attention/output/LayerNorm/gamma': 
'bert.bert.bert_encoder.layers.23.attention.output' - '.layernorm.gamma', - 'bert/encoder/layer_23/attention/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.23.attention.output' - '.layernorm.beta', - 'bert/encoder/layer_23/intermediate/dense/kernel': 'bert.bert.bert_encoder.layers.23.intermediate.weight', - 'bert/encoder/layer_23/intermediate/dense/bias': 'bert.bert.bert_encoder.layers.23.intermediate.bias', - 'bert/encoder/layer_23/output/dense/kernel': 'bert.bert.bert_encoder.layers.23.output.dense.weight', - 'bert/encoder/layer_23/output/dense/bias': 'bert.bert.bert_encoder.layers.23.output.dense.bias', - 'bert/encoder/layer_23/output/LayerNorm/gamma': 'bert.bert.bert_encoder.layers.23.output.layernorm.gamma', - 'bert/encoder/layer_23/output/LayerNorm/beta': 'bert.bert.bert_encoder.layers.23.output.layernorm.beta', - 'bert/pooler/dense/kernel': 'bert.bert.dense.weight', - 'bert/pooler/dense/bias': 'bert.bert.dense.bias', - 'cls/predictions/output_bias': 'bert.cls1.output_bias', - 'cls/predictions/transform/dense/kernel': 'bert.cls1.dense.weight', - 'cls/predictions/transform/dense/bias': 'bert.cls1.dense.bias', - 'cls/predictions/transform/LayerNorm/gamma': 'bert.cls1.layernorm.gamma', - 'cls/predictions/transform/LayerNorm/beta': 'bert.cls1.layernorm.beta', - 'cls/seq_relationship/output_weights': 'bert.cls2.dense.weight', - 'cls/seq_relationship/output_bias': 'bert.cls2.dense.bias', -} diff --git a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms_and_tf_checkpoint_transfer_tools.py b/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms_and_tf_checkpoint_transfer_tools.py deleted file mode 100644 index 93eb025f1..000000000 --- a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/ms_and_tf_checkpoint_transfer_tools.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ - -""" - mindspore and tensorflow checkpoint transfer tools -""" - -import argparse -import tensorflow as tf -from mindspore.common.tensor import Tensor -from mindspore.train.serialization import load_checkpoint, save_checkpoint -from ms2tf_config import param_name_dict as ms2tf_param_dict - - -def convert_ms_2_tf(tf_ckpt_path, ms_ckpt_path, new_ckpt_path): - """ - convert ms checkpoint to tf checkpoint - """ - # load MS checkpoint - ms_param_dict = load_checkpoint(ms_ckpt_path) - for name in ms_param_dict.keys(): - if isinstance(ms_param_dict[name].data, Tensor): - ms_param_dict[name] = ms_param_dict[name].data.asnumpy() - - convert_count = 0 - with tf.Session() as sess: - # convert ms shape to tf - print("start convert parameter ...") - new_var_list = [] - for var_name, shape in tf.contrib.framework.list_variables(tf_ckpt_path): - if var_name in ms2tf_param_dict: - ms_name = ms2tf_param_dict[var_name] - - new_tensor = tf.convert_to_tensor(ms_param_dict[ms_name]) - if len(shape) == 2: - if tuple(shape) != new_tensor.shape or new_tensor.shape[0] == new_tensor.shape[1]: - new_tensor = tf.transpose(new_tensor, (1, 0)) - if new_tensor.shape != tuple(shape): - raise ValueError("shape is not matched after transpose!! {}, {}" - .format(str(new_tensor.shape), str(tuple(shape)))) - - if new_tensor.shape != tuple(shape): - raise ValueError("shape is not matched after transpose!! {}, {}" - .format(str(new_tensor.shape), str(tuple(shape)))) - var = tf.Variable(new_tensor, name=var_name) - convert_count = convert_count + 1 - else: - var = tf.Variable(tf.contrib.framework.load_variable(tf_ckpt_path, var_name), name=var_name) - new_var_list.append(var) - print('convert value num: ', convert_count, " of ", len(ms2tf_param_dict)) - - # saving tf checkpoint - print("start saving ...") - saver = tf.train.Saver(var_list=new_var_list) - sess.run(tf.global_variables_initializer()) - saver.save(sess, new_ckpt_path) - print("tf checkpoint was save in :", new_ckpt_path) - - return True - - -def convert_tf_2_ms(tf_ckpt_path, ms_ckpt_path, new_ckpt_path): - """ - convert tf checkpoint to ms checkpoint - """ - tf2ms_param_dict = dict(zip(ms2tf_param_dict.values(), ms2tf_param_dict.keys())) - - # load MS checkpoint - ms_param_dict = load_checkpoint(ms_ckpt_path) - - new_params_list = [] - session = tf.compat.v1.Session() - count = 0 - for ms_name in tf2ms_param_dict.keys(): - count += 1 - param_dict = {} - - tf_name = tf2ms_param_dict[ms_name] - data = tf.train.load_variable(tf_ckpt_path, tf_name) - ms_shape = ms_param_dict[ms_name].data.shape - tf_shape = data.shape - - if len(ms_shape) == 2: - if ms_shape != tf_shape or ms_shape[0] == ms_shape[1]: - data = tf.transpose(data, (1, 0)) - data = data.eval(session=session) - - param_dict['name'] = ms_name - param_dict['data'] = Tensor(data) - - new_params_list.append(param_dict) - print("start saving checkpoint ...") - save_checkpoint(new_params_list, new_ckpt_path) - print("ms checkpoint was save in :", new_ckpt_path) - - return True - - -def main(): - """ - tf checkpoint transfer to ms or ms checkpoint transfer to tf - """ - parser = argparse.ArgumentParser(description='checkpoint transfer.') - parser.add_argument("--tf_ckpt_path", type=str, default='./tf-bert/bs64k_32k_ckpt_model.ckpt-28252', - help="TensorFlow checkpoint dir, default is: './tf-bert/bs64k_32k_ckpt_model.ckpt-28252'.") - parser.add_argument("--ms_ckpt_path", type=str, default='./ms-bert/large_en.ckpt', - help="MindSpore 
checkpoint dir, default is: './ms-bert/large_en.ckpt'.") - parser.add_argument("--new_ckpt_path", type=str, default='./new_ckpt/new_bert_large_en.ckpt', - help="New checkpoint dir, default is: './new_ckpt/new_bert_large_en.ckpt'.") - parser.add_argument("--transfer_option", type=str, default='ms2tf', - help="option of transfer ms2tf or tf2ms, default is ms2tf.") - - args_opt = parser.parse_args() - - if args_opt.transfer_option == 'ms2tf': - print("start ms2tf option ...") - tf_ckpt_path = args_opt.tf_ckpt_path - ms_ckpt_path = args_opt.ms_ckpt_path - new_ckpt_path = args_opt.new_ckpt_path - convert_ms_2_tf(tf_ckpt_path, ms_ckpt_path, new_ckpt_path) - elif args_opt.transfer_option == 'tf2ms': - print("start tf2ms option ...") - tf_ckpt_path = args_opt.tf_ckpt_path - ms_ckpt_path = args_opt.ms_ckpt_path - new_ckpt_path = args_opt.new_ckpt_path - convert_tf_2_ms(tf_ckpt_path, ms_ckpt_path, new_ckpt_path) - else: - print("ERROR: '--transfer_option' please select 0 or 1") - - -if __name__ == "__main__": - main() diff --git a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/readme.md b/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/readme.md deleted file mode 100644 index 3b9add2d6..000000000 --- a/model_zoo/official/nlp/bert/scripts/ms_and_tf_checkpoint_transfer_tools/readme.md +++ /dev/null @@ -1,22 +0,0 @@ -# Mindspore and Tensorflow checkpoint transfer tools - -# How to use -## 1. For Mindspore to Tensorflow -``` -python ms_and_tf_checkpoint_transfer_for_bert_large.py \ ---transfer_option='ms2tf' \ ---ms_ckpt_path='/data/ms-bert/checkpoint_bert-1500_100.ckpt' \ ---tf_ckpt_path='/data/tf-bert/bs64k_32k_ckpt_model.ckpt' \ ---new_ckpt_path='/data/ms2tf/tf_bert_large_1500-100.ckpt' -``` -## 2. For Tensorflow to Mindspore -``` -python ms_and_tf_checkpoint_transfer_for_bert_large.py \ ---transfer_option='tf2ms' \ ---tf_ckpt_path='/data/tf-bert/tf_bert_large_1500-100.ckpt' \ ---ms_ckpt_path='/data/ms-bert/checkpoint_bert-1500_100.ckpt' \ ---new_ckpt_path='/data/tf2ms/ms_bert_large_1500-100.ckpt' -``` - -# Note -Please note that both tf2ms and ms2tf require two inputs, one output, one of the two inputs is the checkpoint to be converted, and the other is the target checkpoint to be referred to. Because there are many types of bert models, the meaning of the target checkpoint is to prevent you from using different checkpoints for conversion errors. 
\ No newline at end of file
diff --git a/model_zoo/official/nlp/bert/scripts/run_classifier.sh b/model_zoo/official/nlp/bert/scripts/run_classifier.sh
index 39516fa41..b01f1f29e 100644
--- a/model_zoo/official/nlp/bert/scripts/run_classifier.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_classifier.sh
@@ -41,4 +41,4 @@ python ${PROJECT_DIR}/../run_classifier.py \
     --load_finetune_checkpoint_path="" \
     --train_data_file_path="" \
     --eval_data_file_path="" \
-    --schema_file_path="" > log.txt 2>&1 &
+    --schema_file_path="" > classifier_log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
similarity index 90%
rename from model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh
rename to model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
index be910fb84..1f7309a24 100644
--- a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
@@ -16,8 +16,8 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE"
-echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
+echo "bash run_distributed_pretrain_ascend.sh DATA_DIR RANK_TABLE_FILE"
+echo "for example: bash run_distributed_pretrain_ascend.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
 echo "For hyper parameter, please note that you should customize the scripts: '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
diff --git a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
similarity index 89%
rename from model_zoo/official/nlp/bert/scripts/run_distribute_pretrain_for_gpu.sh
rename to model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
index 8deff766b..7ca644652 100644
--- a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain_for_gpu.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
@@ -16,8 +16,8 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json"
+echo "bash run_distributed_pretrain_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR"
+echo "for example: bash run_distributed_pretrain_for_gpu.sh 8 40 /path/zh-wiki/ /path/Schema.json"
 echo "It is better to use absolute path."
echo "==============================================================================================================" diff --git a/model_zoo/official/nlp/bert/scripts/run_ner.sh b/model_zoo/official/nlp/bert/scripts/run_ner.sh index 45c37be65..fc3f8328f 100644 --- a/model_zoo/official/nlp/bert/scripts/run_ner.sh +++ b/model_zoo/official/nlp/bert/scripts/run_ner.sh @@ -44,4 +44,4 @@ python ${PROJECT_DIR}/../run_ner.py \ --load_finetune_checkpoint_path="" \ --train_data_file_path="" \ --eval_data_file_path="" \ - --schema_file_path="" > log.txt 2>&1 & + --schema_file_path="" > ner_log.txt 2>&1 & diff --git a/model_zoo/official/nlp/bert/scripts/run_squad.sh b/model_zoo/official/nlp/bert/scripts/run_squad.sh index efca61db1..7c68be626 100644 --- a/model_zoo/official/nlp/bert/scripts/run_squad.sh +++ b/model_zoo/official/nlp/bert/scripts/run_squad.sh @@ -42,4 +42,4 @@ python ${PROJECT_DIR}/../run_squad.py \ --load_finetune_checkpoint_path="" \ --train_data_file_path="" \ --eval_data_file_path="" \ - --schema_file_path="" > log.txt 2>&1 & + --schema_file_path="" > squad_log.txt 2>&1 & diff --git a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain.sh b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_ascend.sh similarity index 96% rename from model_zoo/official/nlp/bert/scripts/run_standalone_pretrain.sh rename to model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_ascend.sh index f59eb6960..c6048f4f6 100644 --- a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain.sh +++ b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_ascend.sh @@ -43,4 +43,4 @@ python ${PROJECT_DIR}/../run_pretrain.py \ --save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ --data_dir=$DATA_DIR \ - --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + --schema_dir=$SCHEMA_DIR > pretraining_log.txt 2>&1 & diff --git a/model_zoo/official/nlp/tinybert/README.md b/model_zoo/official/nlp/tinybert/README.md index 13cfa3d77..2d758ef85 100644 --- a/model_zoo/official/nlp/tinybert/README.md +++ b/model_zoo/official/nlp/tinybert/README.md @@ -1,83 +1,158 @@ -# TinyBERT Example -## Description -[TinyBERT](https://github.com/huawei-noah/Pretrained-Model/tree/master/TinyBERT) is 7.5x smalller and 9.4x faster on inference than [BERT-base](https://github.com/google-research/bert) (the base version of BERT model) and achieves competitive performances in the tasks of natural language understanding. It performs a novel transformer distillation at both the pre-training and task-specific learning stages. +# Contents +- [TinyBERT Description](#tinybert-description) +- [Model Architecture](#model-architecture) +- [Dataset](#dataset) +- [Environment Requirements](#environment-requirements) +- [Quick Start](#quick-start) +- [Script Description](#script-description) + - [Script and Sample Code](#script-and-sample-code) + - [Script Parameters](#script-parameters) + - [Dataset Preparation](#dataset-preparation) + - [Training Process](#training-process) + - [Evaluation Process](#evaluation-process) +- [Model Description](#model-description) + - [Performance](#performance) + - [Training Performance](#training-performance) + - [Evaluation Performance](#evaluation-performance) +- [Description of Random Situation](#description-of-random-situation) +- [ModelZoo Homepage](#modelzoo-homepage) -## Requirements -- Install [MindSpore](https://www.mindspore.cn/install/en). -- Download dataset for general distill and task distill such as GLUE. 
-- Prepare a pre-trained bert model and a fine-tuned bert model for specific task such as GLUE.
+# [TinyBERT Description](#contents)
+[TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) is 7.5x smaller and 9.4x faster on inference than [BERT-base](https://github.com/google-research/bert) (the base version of the BERT model) and achieves competitive performance on natural language understanding tasks. It performs a novel transformer distillation at both the pre-training and task-specific learning stages.
-## Running the Example
-### General Distill
-- Set options in `src/gd_config.py`, including lossscale, optimizer and network.
+[Paper](https://arxiv.org/abs/1909.10351): Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, Qun Liu. [TinyBERT: Distilling BERT for Natural Language Understanding](https://arxiv.org/abs/1909.10351). arXiv preprint arXiv:1909.10351.
-- Set options in `scripts/run_standalone_gd.sh`, including device target, data sink config, checkpoint config and dataset. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
+# [Model Architecture](#contents)
+The backbone of TinyBERT is the transformer encoder: the network contains four encoder modules, each encoder module contains a self-attention module, and each self-attention module contains an attention module.
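+
+These sizes correspond to the TinyBERT_4 student setting in the paper (4 encoder layers, hidden size 312, intermediate size 1200, 12 attention heads, roughly 14.5M parameters). A quick way to confirm what this implementation actually uses; the field names below are assumed to follow the BertConfig-style keys documented under [Script Parameters](#script-parameters), and `src/gd_config.py` remains authoritative:
+
+```bash
+# Print the student network sizes from the general-distill config.
+# Expected (per the paper's TinyBERT_4 setting, not verified here):
+#   num_hidden_layers=4, hidden_size=312,
+#   intermediate_size=1200, num_attention_heads=12
+grep -nE "num_hidden_layers|hidden_size|intermediate_size|num_attention_heads" src/gd_config.py
+```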
-- Run `run_standalone_gd.sh` for non-distributed general distill of BERT-base model.
+# [Dataset](#contents)
+- Download the zhwiki or enwiki dataset for general distillation. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format; please refer to create_pretraining_data.py in the [BERT](https://github.com/google-research/bert) repository.
+- Download the GLUE dataset for task distillation. Convert the dataset files from JSON format to TFRecord format; please refer to run_classifier.py in the [BERT](https://github.com/google-research/bert) repository.
- ``` bash
- bash scripts/run_standalone_gd.sh
- ```
-- Run `run_distribute_gd.sh` for distributed general distill of BERT-base model.
+# [Environment Requirements](#contents)
+- Hardware (Ascend)
+  - Prepare the hardware environment with an Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get access to the resources.
+- Framework
+  - [MindSpore](https://gitee.com/mindspore/mindspore)
+- For more information, please check the resources below:
+  - [MindSpore tutorials](https://www.mindspore.cn/tutorial/en/master/index.html)
+  - [MindSpore API](https://www.mindspore.cn/api/en/master/index.html)
- ``` bash
- bash scripts/run_distribute_gd.sh DEVICE_NUM EPOCH_SIZE RANK_TABLE_FILE
- ```
+# [Quick Start](#contents)
+After installing MindSpore via the official website, you can start general distillation, task distillation and evaluation as follows:
+```bash
+# run standalone general distill example
+bash scripts/run_standalone_gd_ascend.sh
-### Task Distill
-Task distill has two phases, pre-distill and task distill.
-- Set options in `src/td_config.py`, including lossscale, optimizer config of phase 1 and 2, as well as network config.
+
+# Before running the script above, set `load_teacher_ckpt_path`, `data_dir` and `schema_dir` in run_standalone_gd_ascend.sh first.
+
+# run distributed general distill example
+bash scripts/run_distributed_gd_ascend.sh 8 1 /path/hccl.json
+
+# Before running the script above, set `load_teacher_ckpt_path`, `data_dir` and `schema_dir` in run_distributed_gd_ascend.sh first.
+
+# run task distill and evaluation example
+bash scripts/run_standalone_td_ascend.sh
+
+# Before running the script above, set `task_name`, `load_teacher_ckpt_path`, `load_gd_ckpt_path`, `train_data_dir`, `eval_data_dir` and `schema_dir` in run_standalone_td_ascend.sh first.
+```
-- Run `run_standalone_td.py` for task distill of BERT-base model.
+For distributed training, an hccl configuration file in JSON format needs to be created in advance.
+Please follow the instructions in the link below:
+https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.
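+
+As a sketch of that step (the `--device_num` flag is taken from the hccl_tools documentation at the link above; please verify it against the version you download), generating a rank table for 8 devices on one machine looks roughly like this:
+
+```bash
+# run on the Ascend host, from a copy of model_zoo/utils/hccl_tools
+python hccl_tools.py --device_num "[0,8)"
+# the tool writes an hccl_*.json rank table file; pass its path as the
+# third argument of run_distributed_gd_ascend.sh above
+```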
- ```bash
- bash scripts/run_standalone_td.sh
- ```
+# [Script Description](#contents)
+## [Script and Sample Code](#contents)
-## Usage
+```shell
+.
+└─tinybert
+  ├─README.md
+  ├─scripts
+    ├─run_distributed_gd_ascend.sh       # shell script for distributed general distill phase on Ascend
+    ├─run_distributed_gd_for_gpu.sh      # shell script for distributed general distill phase on GPU
+    ├─run_standalone_gd_ascend.sh        # shell script for standalone general distill phase
+    ├─run_standalone_td_ascend.sh        # shell script for standalone task distill phase
+  ├─src
+    ├─__init__.py
+    ├─assessment_method.py               # assessment method for evaluation
+    ├─dataset.py                         # data processing
+    ├─fused_layer_norm.py                # fused layer norm, optimized for Ascend
+    ├─gd_config.py                       # parameter configuration for general distill phase
+    ├─td_config.py                       # parameter configuration for task distill phase
+    ├─tinybert_for_gd_td.py              # backbone code of network
+    ├─tinybert_model.py                  # backbone code of network
+    ├─utils.py                           # util function
+  ├─__init__.py
+  ├─run_general_distill.py               # train net for general distillation
+  ├─run_task_distill.py                  # train and eval net for task distillation
+```
+
+## [Script Parameters](#contents)
 ### General Distill
 ```
+usage: run_general_distill.py  [--distribute DISTRIBUTE] [--epoch_size N] [--device_num N] [--device_id N]
+                               [--device_target DEVICE_TARGET] [--do_shuffle DO_SHUFFLE]
+                               [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N]
+                               [--save_ckpt_path SAVE_CKPT_PATH]
+                               [--load_teacher_ckpt_path LOAD_TEACHER_CKPT_PATH]
+                               [--save_checkpoint_step N] [--max_ckpt_num N]
+                               [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR] [train_steps N]
 options:
-    --distribute               whether to run distributely: "true" | "false"
-    --device_target            targeted device to run task: "Ascend" | "GPU"
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
+    --distribute               pre-training by several devices: "true"(training by more than 1 device) | "false", default is "false"
     --epoch_size               epoch size: N, default is 1
    --device_id                device id: N, default is 0
+    --device_num               number of used devices: N, default is 1
+    --save_ckpt_path           path to save checkpoint files: PATH, default is ""
+    --max_ckpt_num             max number for saving checkpoint files: N, default is 1
+    --do_shuffle               enable shuffle: "true" | "false", default is "true"
     --enable_data_sink         enable data sink: "true" | "false", default is "true"
     --data_sink_steps          set data sink steps: N, default is 1
-    --load_teacher_ckpt_path   path of teacher checkpoint to load: PATH, default is ""
+    --save_checkpoint_step     steps for saving checkpoint files: N, default is 1000
+    --load_teacher_ckpt_path   path to load teacher checkpoint files: PATH, default is ""
     --data_dir                 path to dataset directory: PATH, default is ""
     --schema_dir               path to schema.json file, PATH, default is ""
-
-usage: run_distribute_gd.py [--distribute DISTRIBUTE] [--device_target DEVICE_TARGET]
-                            [--epoch_size N] [--device_id N] [--device_num N]
-                            [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N]
-                            [--save_ckpt_steps N] [--max_ckpt_num N]
-                            [--load_teacher_ckpt_path LOAD_TEACHER_CKPT_PATH]
-                            [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR]
+```
+
+### Task Distill
+```
+usage: run_task_distill.py  [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [--do_eval DO_EVAL]
+                            [--td_phase1_epoch_size N] [--td_phase2_epoch_size N]
+                            [--device_id N] [--do_shuffle DO_SHUFFLE]
+                            [--enable_data_sink ENABLE_DATA_SINK] [--save_ckpt_step N]
+                            [--max_ckpt_num N] [--data_sink_steps N]
+                            [--load_teacher_ckpt_path LOAD_TEACHER_CKPT_PATH]
+                            [--load_gd_ckpt_path LOAD_GD_CKPT_PATH]
+                            [--load_td1_ckpt_path LOAD_TD1_CKPT_PATH]
+                            [--train_data_dir TRAIN_DATA_DIR]
+                            [--eval_data_dir EVAL_DATA_DIR]
+                            [--task_name TASK_NAME] [--schema_dir SCHEMA_DIR]
 options:
-    --distribute               whether to run distributely: "true" | "false"
-    --device_target            targeted device to run task: "Ascend" | "GPU"
-    --epoch_size               epoch size: N, default is 1
+    --device_target            device where the code will be implemented: "Ascend" | "GPU", default is "Ascend"
+    --do_train                 enable train task: "true" | "false", default is "true"
+    --do_eval                  enable eval task: "true" | "false", default is "true"
+    --td_phase1_epoch_size     epoch size for td phase1: N, default is 10
+    --td_phase2_epoch_size     epoch size for td phase2: N, default is 3
     --device_id                device id: N, default is 0
-    --device_num               device id to run task
-    --enable_data_sink         enable data sink: "true" | "false", default is "true"
+    --do_shuffle               enable shuffle: "true" | "false", default is "true"
+    --enable_data_sink         enable data sink: "true" | "false", default is "true"
+    --save_ckpt_step           steps for saving checkpoint files: N, default is 1000
+    --max_ckpt_num             max number for saving checkpoint files: N, default is 1
     --data_sink_steps          set data sink steps: N, default is 1
-    --load_teacher_ckpt_path   path of teacher checkpoint to load: PATH, default is ""
-    --data_dir                 path to dataset directory: PATH, default is ""
+    --load_teacher_ckpt_path   path to load teacher checkpoint files: PATH, default is ""
+    --load_gd_ckpt_path        path to load checkpoint files which were produced by general distill: PATH, default is ""
+    --load_td1_ckpt_path       path to load checkpoint files which were produced by task distill phase 1: PATH, default is ""
+    --train_data_dir           path to train dataset directory: PATH, default is ""
+    --eval_data_dir            path to eval dataset directory: PATH, default is ""
+    --task_name                classification task: "SST-2" | "QNLI" | "MNLI", default is ""
     --schema_dir               path to schema.json file, PATH, default is ""
-
 ```
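+
+The shell scripts are the supported entry points, but for clarity, a task-distill run with the flags documented above reduces to a command along these lines (a sketch only; all paths are placeholders, and scripts/run_standalone_td_ascend.sh remains the authoritative argument list):
+
+```bash
+python run_task_distill.py \
+    --device_target="Ascend" \
+    --do_train="true" \
+    --do_eval="true" \
+    --task_name="SST-2" \
+    --load_teacher_ckpt_path="/path/to/teacher_finetuned.ckpt" \
+    --load_gd_ckpt_path="/path/to/general_distill.ckpt" \
+    --train_data_dir="/path/to/sst2/train/" \
+    --eval_data_dir="/path/to/sst2/eval/" \
+    --schema_dir="/path/to/schema.json" > log.txt 2>&1 &
+```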
 
 ## Options and Parameters
-`gd_config.py` and `td_config.py` Contain parameters of BERT model and options for optimizer and lossscale.
+`gd_config.py` and `td_config.py` contain the parameters of the BERT model and the options for the optimizer and lossscale.
 ### Options:
 ```
@@ -85,12 +160,12 @@ Parameters for lossscale:
     scale_factor                    factor used to update loss scale: N, default is 2
     scale_window                    steps for updating loss scale once: N, default is 50
-Parameters for task-specific config:
-    load_teacher_ckpt_path          teacher checkpoint to load
-    load_student_ckpt_path          student checkpoint to load
-    data_dir                        training data dir
-    eval_data_dir                   evaluation data dir
-    schema_dir                      data schema path
+Parameters for optimizer:
+    learning_rate                   value of learning rate: Q
+    end_learning_rate               value of end learning rate: Q, must be positive
+    power                           power: Q
+    weight_decay                    weight decay: Q
+    eps                             term added to the denominator to improve numerical stability: Q
 ```
 
 ### Parameters:
@@ -117,13 +192,128 @@ Parameters for bert network:
     dtype                           data type of input: mstype.float16 | mstype.float32, default is mstype.float32
     compute_type                    compute type in BertTransformer: mstype.float16 | mstype.float32, default is mstype.float16
     enable_fused_layernorm          use batchnorm instead of layernorm to improve performance, default is False
+```
+## [Training Process](#contents)
+### Training
+#### Running on Ascend
+Before running the command below, please check that `load_teacher_ckpt_path`, `data_dir` and `schema_dir` have been set. Please set each path to be an absolute full path, e.g. "/username/checkpoint_100_300.ckpt".
+```
+bash scripts/run_standalone_gd_ascend.sh
+```
+The command above will run in the background; you can view the results in the file log.txt. After training, you will get some checkpoint files under the script folder by default. The loss values will look as follows:
+```
+# grep "epoch" log.txt
+epoch: 1, step: 100, outpus are (Tensor(shape=[1], dtype=Float32, 28.2093), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+epoch: 2, step: 200, outpus are (Tensor(shape=[1], dtype=Float32, 30.1724), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+```
-Parameters for optimizer:
-    optimizer                       optimizer used in the network: AdamWeightDecay
-    learning_rate                   value of learning rate: Q
-    end_learning_rate               value of end learning rate: Q, must be positive
-    power                           power: Q
-    weight_decay                    weight decay: Q
-    eps                             term added to the denominator to improve numerical stability: Q
+### Distributed Training
+#### Running on Ascend
+Before running the command below, please check that `load_teacher_ckpt_path`, `data_dir` and `schema_dir` have been set. Please set each path to be an absolute full path, e.g. "/username/checkpoint_100_300.ckpt".
+```
+bash scripts/run_distributed_gd_ascend.sh 8 1 /path/hccl.json
+```
+The command above will run in the background; you can view the results in the file log.txt. After training, you will get some checkpoint files under the LOG* folder by default. The loss values will look as follows:
+```
+# grep "epoch" LOG*/log.txt
+epoch: 1, step: 100, outpus are (Tensor(shape=[1], dtype=Float32, 28.1478), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+epoch: 1, step: 100, outpus are (Tensor(shape=[1], dtype=Float32, 30.5901), Tensor(shape=[], dtype=Bool, False), Tensor(shape=[], dtype=Float32, 65536))
+...
+```
+
+## [Evaluation Process](#contents)
+### Evaluation
+If you want to run evaluation right after training, set `do_train=true` and `do_eval=true`; if you want to run evaluation alone, set `do_train=false` and `do_eval=true`.
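+
+For instance, an evaluation-only pass corresponds to toggling just those two flags in the task-distill sketch from the [Script Parameters](#script-parameters) section (paths are again placeholders):
+
+```bash
+python run_task_distill.py \
+    --do_train="false" \
+    --do_eval="true" \
+    --task_name="SST-2" \
+    --load_td1_ckpt_path="/path/to/td_phase1.ckpt" \
+    --eval_data_dir="/path/to/sst2/eval/" \
+    --schema_dir="/path/to/schema.json" > log.txt 2>&1 &
+```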
+#### Evaluation on the SST-2 dataset when running on Ascend
+```
+bash scripts/run_standalone_td_ascend.sh
+```
+The command above will run in the background; you can view the results in the file log.txt. The accuracy on the test dataset will be as follows:
+```bash
+# grep "The best acc" log.txt
+The best acc is 0.872685
+The best acc is 0.893515
+The best acc is 0.899305
+...
+The best acc is 0.902777
+...
+```
+#### Evaluation on the MNLI dataset when running on Ascend
+Before running the command below, please check that the load pretrain checkpoint path has been set. Please set the checkpoint path to be an absolute full path, e.g. "/username/pretrain/checkpoint_100_300.ckpt".
+```
+bash scripts/run_standalone_td_ascend.sh
+```
+The command above will run in the background; you can view the results in the file log.txt. The accuracy on the test dataset will be as follows:
+```
+# grep "The best acc" log.txt
+The best acc is 0.803206
+The best acc is 0.803308
+The best acc is 0.810355
+...
+The best acc is 0.813929
+...
+```
+#### Evaluation on the QNLI dataset when running on Ascend
+Before running the command below, please check that the load pretrain checkpoint path has been set. Please set the checkpoint path to be an absolute full path, e.g. "/username/pretrain/checkpoint_100_300.ckpt".
+```
+bash scripts/run_standalone_td_ascend.sh
+```
+The command above will run in the background; you can view the results in the file log.txt. The accuracy on the test dataset will be as follows:
+```
+# grep "The best acc" log.txt
+The best acc is 0.870772
+The best acc is 0.871691
+The best acc is 0.875183
+...
+The best acc is 0.891176
+...
+```
+
+# [Model Description](#contents)
+## [Performance](#contents)
+### Training Performance
+| Parameters                 | TinyBERT                                     | TinyBERT            |
+| -------------------------- | -------------------------------------------- | ------------------- |
+| Model Version              |                                              |                     |
+| Resource                   | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G    |
+| uploaded Date              | 08/20/2020                                   | 05/06/2020          |
+| MindSpore Version          | 0.6.0                                        | 0.3.0               |
+| Dataset                    | cn-wiki-128                                  | ImageNet            |
+| Training Parameters        | src/gd_config.py                             | src/config.py       |
+| Optimizer                  | AdamWeightDecay                              | AdamWeightDecay     |
+| Loss Function              | SoftmaxCrossEntropy                          | SoftmaxCrossEntropy |
+| outputs                    | probability                                  |                     |
+| Loss                       | 6.541583                                     | 1.913               |
+| Speed                      | 35.4ms/step                                  |                     |
+| Total time                 | 17.3h                                        |                     |
+| Params (M)                 | 15M                                          |                     |
+| Checkpoint for task distill| 74M(.ckpt file)                              |                     |
+
+### Inference Performance
+
+| Parameters                 | Parameters                    |                           |                      |
+| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
+| Model Version              |                               |                           |                      |
+| Resource                   | Huawei 910                    | NV SMX2 V100-32G          | Huawei 310           |
+| uploaded Date              | 08/20/2020                    | 05/22/2020                |                      |
+| MindSpore Version          | 0.6.0                         | 0.2.0                     | 0.2.0                |
+| Dataset                    | SST-2                         | ImageNet, 1.2W            | ImageNet, 1.2W       |
+| batch_size                 | 32                            | 130(8P)                   |                      |
+| Accuracy                   | 0.902777                      | ACC1[72.07%] ACC5[90.90%] |                      |
+| Speed                      |                               |                           |                      |
+| Total time                 |                               |                           |                      |
+| Model for inference        | 74M(.ckpt file)               |                           |                      |
+
+# [Description of Random Situation](#contents)
+
+In run_standalone_td_ascend.sh, we set the do_shuffle option to shuffle the dataset.
+
+In gd_config.py and td_config.py, we set hidden_dropout_prob and attention_probs_dropout_prob so that some network nodes are randomly dropped out.
+
+In run_general_distill.py, we set the random seed to make sure that distributed training starts from the same initial weights.
+# [ModelZoo Homepage](#contents)
+
+Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
\ No newline at end of file
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_ascend.sh
similarity index 93%
rename from model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh
rename to model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_ascend.sh
index e4c15ebf9..79762a7bc 100644
--- a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh
+++ b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_ascend.sh
@@ -16,8 +16,8 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash scripts/run_distribute_gd.sh DEVICE_NUM EPOCH_SIZE RANK_TABLE_FILE"
-echo "for example: bash scripts/run_distribute_gd.sh 8 40 /path/hccl.json"
+echo "bash scripts/run_distributed_gd_ascend.sh DEVICE_NUM EPOCH_SIZE RANK_TABLE_FILE"
+echo "for example: bash scripts/run_distributed_gd_ascend.sh 8 40 /path/hccl.json"
 echo "It is better to use absolute path."
 echo "running....... please see details by LOG{}/log.txt"
 echo "=============================================================================================================="
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_for_gpu.sh
similarity index 86%
rename from model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
rename to model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_for_gpu.sh
index a12b58f98..0fa62824e 100644
--- a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
+++ b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_for_gpu.sh
@@ -16,8 +16,8 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distribute_gd_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR TEACHER_CKPT_PATH"
-echo "for example: bash run_distribute_gd_for_gpu.sh 8 3 /path/data/ /path/datasetSchema.json /path/bert_base.ckpt"
+echo "bash run_distributed_gd_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR TEACHER_CKPT_PATH"
+echo "for example: bash run_distributed_gd_for_gpu.sh 8 3 /path/data/ /path/datasetSchema.json /path/bert_base.ckpt"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_standalone_gd.sh b/model_zoo/official/nlp/tinybert/scripts/run_standalone_gd_ascend.sh
similarity index 100%
rename from model_zoo/official/nlp/tinybert/scripts/run_standalone_gd.sh
rename to model_zoo/official/nlp/tinybert/scripts/run_standalone_gd_ascend.sh
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_standalone_td.sh b/model_zoo/official/nlp/tinybert/scripts/run_standalone_td_ascend.sh
similarity index 100%
rename from model_zoo/official/nlp/tinybert/scripts/run_standalone_td.sh
rename to model_zoo/official/nlp/tinybert/scripts/run_standalone_td_ascend.sh
-- 
GitLab