diff --git a/tests/model/BingBertSquad/test_e2e_squad.py b/tests/model/BingBertSquad/test_e2e_squad.py
new file mode 100644
index 0000000000000000000000000000000000000000..0140ebd877703c2446c6dd9839951cc8040141f6
--- /dev/null
+++ b/tests/model/BingBertSquad/test_e2e_squad.py
@@ -0,0 +1,138 @@
+import subprocess as sp
+import os
+from math import isclose
+import sys
+import pytest
+import json
+
+sys.path.append("../../../DeepSpeedExamples/BingBertSquad")
+import evaluate as eval
+
+squad_dir = "/data/BingBertSquad"
+base_dir = "../../../DeepSpeedExamples/BingBertSquad"
+
+script_file_name = "run_squad_deepspeed.sh"
+model_file_name = "training_state_checkpoint_162.tar"
+eval_file_name = "dev-v1.1.json"
+pred_file_name = "predictions.json"
+
+num_gpus = "4"
+timeout_sec = 5 * 60 * 60  # 5 hours
+
+eval_version = "1.1"
+
+
+def create_config_file(tmpdir, zeroenabled=False):
+    """Write a DeepSpeed config for the SQuAD fine-tuning run into tmpdir."""
+    config_dict = {
+        "train_batch_size": 24,
+        "train_micro_batch_size_per_gpu": 6,
+        "steps_per_print": 10,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 3e-5,
+                "weight_decay": 0.0,
+                "bias_correction": False
+            }
+        },
+        "gradient_clipping": 1.0,
+        "fp16": {
+            "enabled": True
+        }
+    }
+    # A boolean here toggles ZeRO on or off for the run.
+    config_dict["zero_optimization"] = zeroenabled
+
+    config_path = os.path.join(tmpdir, 'temp_config.json')
+    with open(config_path, 'w') as fd:
+        json.dump(config_dict, fd)
+    return config_path
+
+
+def test_e2e_squad_deepspeed_base(tmpdir):
+    config_file = create_config_file(tmpdir)
+
+    # base run results => {"exact_match": 83.9829706717124, "f1": 90.71138132004097}
+    expected_exact_match = 83.98
+    expected_f1 = 90.71
+
+    model_file = os.path.join(squad_dir, model_file_name)
+    eval_file = os.path.join(squad_dir, eval_file_name)
+
+    output_dir = os.path.join(tmpdir, "output")
+    pred_file = os.path.join(output_dir, pred_file_name)
+
+    # Launch the distributed fine-tuning script and wait for it to finish.
+    proc = sp.Popen([
+        "bash",
+        script_file_name,
+        num_gpus,
+        model_file,
+        squad_dir,
+        output_dir,
+        config_file
+    ],
+                    cwd=base_dir)
+
+    try:
+        proc.communicate(timeout=timeout_sec)
+    except sp.TimeoutExpired:
+        proc.kill()
+        pytest.fail("Error: Timeout")
+
+    # communicate() does not raise on a non-zero exit status, so check the
+    # return code and the prediction file explicitly.
+    if proc.returncode != 0 or not os.path.exists(pred_file):
+        pytest.fail("Error: Run Failed")
+
+    eval_result = eval.evaluate(eval_version, eval_file, pred_file)
+
+    print("evaluation result: ", json.dumps(eval_result))
+
+    assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2)
+    assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2)
+
+
+def test_e2e_squad_deepspeed_zero(tmpdir):
+    config_file = create_config_file(tmpdir, True)
+
+    # ZeRO run results => {"exact_match": 84.1438032166509, "f1": 90.89776136505441}
+    expected_exact_match = 84.14
+    expected_f1 = 90.89
+
+    model_file = os.path.join(squad_dir, model_file_name)
+    eval_file = os.path.join(squad_dir, eval_file_name)
+
+    output_dir = os.path.join(tmpdir, "output")
+    pred_file = os.path.join(output_dir, pred_file_name)
+
+    # Launch the distributed fine-tuning script and wait for it to finish.
+    proc = sp.Popen([
+        "bash",
+        script_file_name,
+        num_gpus,
+        model_file,
+        squad_dir,
+        output_dir,
+        config_file
+    ],
+                    cwd=base_dir)
+
+    try:
+        proc.communicate(timeout=timeout_sec)
+    except sp.TimeoutExpired:
+        proc.kill()
+        pytest.fail("Error: Timeout")
+
+    # communicate() does not raise on a non-zero exit status, so check the
+    # return code and the prediction file explicitly.
+    if proc.returncode != 0 or not os.path.exists(pred_file):
+        pytest.fail("Error: Run Failed")
+
+    eval_result = eval.evaluate(eval_version, eval_file, pred_file)
+
+    print("evaluation result: ", json.dumps(eval_result))
+
+    assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2)
+    assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2)
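
The two tests above are identical except for the ZeRO flag and the expected metrics. A possible follow-up, not part of this diff, is to collapse them into one parametrized test; the minimal sketch below assumes it replaces both functions inside the same module, so `create_config_file`, `eval`, and the module-level constants (`squad_dir`, `script_file_name`, `num_gpus`, `timeout_sec`, etc.) are already in scope.

import os
import json
import subprocess as sp
from math import isclose

import pytest


@pytest.mark.parametrize(
    "zeroenabled, expected_exact_match, expected_f1",
    [
        (False, 83.98, 90.71),  # baseline numbers from the diff above
        (True, 84.14, 90.89),   # ZeRO numbers from the diff above
    ],
)
def test_e2e_squad_deepspeed(tmpdir, zeroenabled, expected_exact_match,
                             expected_f1):
    config_file = create_config_file(tmpdir, zeroenabled)

    model_file = os.path.join(squad_dir, model_file_name)
    eval_file = os.path.join(squad_dir, eval_file_name)
    output_dir = os.path.join(tmpdir, "output")
    pred_file = os.path.join(output_dir, pred_file_name)

    # Same launch-and-wait flow as the two original tests.
    proc = sp.Popen([
        "bash", script_file_name, num_gpus, model_file, squad_dir,
        output_dir, config_file
    ],
                    cwd=base_dir)
    try:
        proc.communicate(timeout=timeout_sec)
    except sp.TimeoutExpired:
        proc.kill()
        pytest.fail("Error: Timeout")

    if proc.returncode != 0 or not os.path.exists(pred_file):
        pytest.fail("Error: Run Failed")

    eval_result = eval.evaluate(eval_version, eval_file, pred_file)
    print("evaluation result: ", json.dumps(eval_result))
    assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2)
    assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2)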