Fix bug in bert_thor

d24ce34c · wangmin · 36977394 · d24ce34c · d24ce34c · d24ce34c
5 changed files
--- a/model_zoo/official/nlp/bert_thor/README.md
+++ b/model_zoo/official/nlp/bert_thor/README.md
@@ -201,7 +201,7 @@ step:  3000 Accuracy:  [0.71377236]
 | Loss Function              | Softmax Cross Entropy                                       |
 | outputs                    | probability                                                 |
 | Loss                       |1.5654222                                                   |
-| Speed                      | 269ms/step（8pcs）                     |
+| Speed                      | 275ms/step（8pcs）                     |
 | Total time                 | 14 mins                          |
 | Parameters (M)             | 330                                                       |
 | Checkpoint for Fine tuning | 4.5G(.ckpt file)                                         |

--- a/model_zoo/official/nlp/bert_thor/pretrain_eval.py
+++ b/model_zoo/official/nlp/bert_thor/pretrain_eval.py
@@ -155,10 +155,11 @@ def MLM_eval():
    res = net.eval(dataset, dataset_sink_mode=False)
    print("==============================================================")
    for _, v in res.items():
-        print("Accuracy is: ")
-        print(v)
+        print("Accuracy is: ", v)
    print("==============================================================")


 if __name__ == "__main__":
+    DEVICE_ID = 1
+    os.environ['DEVICE_ID'] = str(DEVICE_ID)
    MLM_eval()
--- a/model_zoo/official/nlp/bert_thor/run_pretrain.py
+++ b/model_zoo/official/nlp/bert_thor/run_pretrain.py
@@ -26,7 +26,6 @@ from src.config import cfg
 from src.dataset import create_bert_dataset
 from src.lr_generator import get_bert_lr, get_bert_damping
 from src.model_thor import Model
-from src.thor_for_bert_arg import THOR
 from src.utils import LossCallBack, BertLearningRate
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
@@ -66,10 +65,15 @@ def run_pretrain():
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
+    if args_opt.distribute == "true":
+        from src.thor_for_bert_arg import THOR
+    else:
+        from src.thor_for_bert import THOR
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
+    context.set_context(max_call_depth=3000)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':

--- a/model_zoo/official/nlp/bert_thor/src/bert_model.py
+++ b/model_zoo/official/nlp/bert_thor/src/bert_model.py
@@ -231,16 +231,17 @@ class EmbeddingPostprocessor(nn.Cell):
            frequency=frequency)
        self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32))
        self.layernorm = nn.LayerNorm((embedding_size,))
+        self.add = P.TensorAdd()

    def construct(self, token_type_ids, word_embeddings):
        """construct of EmbeddingPostprocessor"""
        output = word_embeddings
        if self.use_token_type:
            token_type_embeddings, _ = self.token_type_embedding(token_type_ids)
-            output += token_type_embeddings
+            output = self.add(output, token_type_embeddings)
        if not self.use_relative_positions:
            position_embeddings, _ = self.full_position_embedding(self.position_ids)
-            output += position_embeddings
+            output = self.add(output, position_embeddings)
        output = self.layernorm(output)
        output = self.dropout(output)
        return output

--- a/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
+++ b/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
@@ -101,6 +101,8 @@ class FusedLayerNorm(Cell):

        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm
+        self.mul = P.Mul()
+        self.add = P.TensorAdd()

    def construct(self, input_x):
        """construct of FusedLayerNorm"""
@@ -112,7 +114,8 @@ class FusedLayerNorm(Cell):
            input_x = F.reshape(input_x, norm_shape)
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
-            y = output * self.gamma + self.beta
+            y = self.mul(output, self.gamma)
+            y = self.add(y, self.beta)
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y