bug fixed

142305ef · u010280923 · c0450690 · 142305ef
显示空白变更内容
内联 · 并排

Showing 1 changed file with 2 additions and 4 deletions

train_rm.py train_rm.py +2 -4

未找到文件。
--- a/train_rm.py
+++ b/train_rm.py
@@ -255,10 +255,8 @@ if __name__ == "__main__":
                print(f"{str(shape[0]).ljust(5)}       {n}")

    if "deepspeed" in args.strategy:
-        del trainer.strategy.config["zero_optimization"]["allgather_bucket_size"]
-        del trainer.strategy.config["zero_optimization"]["reduce_bucket_size"]
-        # trainer.strategy.config["zero_optimization"]["allgather_bucket_size"] = args.ds_bucket_mb * 1000 * 1000
-        # trainer.strategy.config["zero_optimization"]["reduce_bucket_size"] = args.ds_bucket_mb * 1000 * 1000
+        trainer.strategy.config["zero_optimization"]["allgather_bucket_size"] = args.ds_bucket_mb * 1000 * 1000
+        trainer.strategy.config["zero_optimization"]["reduce_bucket_size"] = args.ds_bucket_mb * 1000 * 1000

    # must set shuffle=True, persistent_workers=False (because worker is in another thread)
    data_loader = DataLoader(train_data, shuffle=True, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)