提交 8d927957 编写于 作者: S shibeiji

Fix out-of-memory machine crash when global shuffle was enabled for a large dataset

上级 9991df86
...@@ -388,7 +388,7 @@ class AdamWeightDecayDynamicLR(Optimizer): ...@@ -388,7 +388,7 @@ class AdamWeightDecayDynamicLR(Optimizer):
beta2=0.999, beta2=0.999,
eps=1e-6, eps=1e-6,
weight_decay=0.0, weight_decay=0.0,
decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
super(AdamWeightDecayDynamicLR, self).__init__(0.0, params) super(AdamWeightDecayDynamicLR, self).__init__(0.0, params)
if self.is_group: if self.is_group:
raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
......
...@@ -36,8 +36,8 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e ...@@ -36,8 +36,8 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
shard_equal_rows=True) num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size() ori_dataset_size = ds.get_dataset_size()
print('origin dataset size: ', ori_dataset_size) print('origin dataset size: ', ori_dataset_size)
new_size = ori_dataset_size new_size = ori_dataset_size
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册