Unverified commit 5ac1848c, authored by Olatunji Ruwase, committed by GitHub

Merge branch 'master' into olruwase/legacy_optimizer_fusion

Subproject commit 07d1ce0d26044602a7b8bb289e590a980f14aded
Subproject commit e0d2d7f4a86f03612bc0a210a5e4dbcc798b48a6
@@ -48,17 +48,33 @@ jobs:
ln -s /data/Megatron-LM/data DeepSpeedExamples/Megatron-LM/
pip install --user -r DeepSpeedExamples/Megatron-LM/requirements.txt
cd tests/model/
#pytest -s run_sanity_check.py
pytest -s run_sanity_check.py
displayName: 'Model tests'
# BingBertSquad logs
#BingBertSquad logs
- task: PublishPipelineArtifact@1
inputs:
targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/'
artifactName: BingBertSquad_logs
displayName: 'BingBertSquad log uploads'
condition: always()
# Megatron test logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/'
# artifactName: Megatron_GPT2_logs
# displayName: 'Megatron GPT2 log uploads'
# condition: always()
#- task: PublishPipelineArtifact@1
#  inputs:
#    targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/'
#    artifactName: BingBertSquad_logs
#  displayName: 'BingBertSquad logs'
#  condition: always()
#- task: PublishPipelineArtifact@1
#  inputs:
#    targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/'
#    artifactName: Megatron_GPT2_checkpoint_logs
#  displayName: 'Megatron GPT2 checkpoint log uploads'
#  condition: always()
#BingBert logs
#- task: PublishPipelineArtifact@1
# inputs:
@@ -73,21 +89,3 @@ jobs:
# artifactName: BingBert_checkpoint_logs
# displayName: 'BingBert checkpoint logs'
# condition: always()
# XXX temporarily disabled
# Megatron test logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/'
# artifactName: Megatron_GPT2_logs
# displayName: 'Megatron GPT2 logs'
# condition: always()
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/'
# artifactName: Megatron_GPT2_checkpoint_logs
# displayName: 'Megatron GPT2 checkpoint logs'
# condition: always()
@@ -116,8 +116,13 @@ class FP16_UnfusedOptimizer(object):
grads_groups = []
norm_groups = []
for i, group in enumerate(self.fp16_groups):
grads_groups.append([p.grad for p in group])
grads_groups_flat.append(_flatten_dense_tensors(grads_groups[i]))
grads = [
torch.zeros(p.size(),
dtype=p.dtype,
device=p.device) if p.grad is None else p.grad for p in group
]
grads_groups.append(grads)
grads_groups_flat.append(_flatten_dense_tensors(grads))
norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu))
self.overflow = self.overflow_checker.check_using_norm(norm_groups)
@@ -162,7 +167,12 @@ class FP16_UnfusedOptimizer(object):
# copying gradients to fp32 to work with fp32 parameters
for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]):
fp32_param.grad = fp16_param.grad.to(fp32_param.dtype)
if fp16_param.grad is None:
fp32_param.grad = torch.zeros(fp16_param.size(),
dtype=fp32_param.dtype,
device=fp32_param.device)
else:
fp32_param.grad = fp16_param.grad.to(fp32_param.dtype)
self.unscale_and_clip_grads(norm_groups)
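The two changes above substitute a zero tensor whenever a parameter's `.grad` is `None` (for example, a layer whose output never contributed to the loss), so that flattening, norm computation, and the fp16-to-fp32 gradient copy never dereference a missing gradient. A minimal, self-contained sketch of the same pattern, assuming only `torch` and its private `_flatten_dense_tensors` helper (an illustration, not the DeepSpeed code path):

```python
import torch
from torch._utils import _flatten_dense_tensors

# Two parameters; only `used` receives a gradient. `unused` mimics a
# layer whose output never contributes to the loss, so its grad is None.
used = torch.nn.Parameter(torch.randn(4))
unused = torch.nn.Parameter(torch.randn(4))
(used * 2.0).sum().backward()

group = [used, unused]
# Substitute zeros for any missing gradient so flattening and norm
# computation never dereference None.
grads = [
    torch.zeros(p.size(), dtype=p.dtype, device=p.device)
    if p.grad is None else p.grad for p in group
]
flat = _flatten_dense_tensors(grads)
print(flat)  # first four entries are 2.0, the last four 0.0
```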
......
@@ -18,6 +18,12 @@ Original model code from [CIFAR-10 Tutorial](https://github.com/pytorch/tutorial
git submodule update --init --recursive
```
To install requirements for CIFAR-10:
```
cd DeepSpeedExamples/cifar
pip install -r requirements.txt
```
Run `python cifar10_tutorial.py`; it downloads the training data set on the first run.
```less
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
......
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy the webtext data into the "Megatron-LM" folder before running this script.
import unittest
import subprocess
import os
import time
import re
from .BingBertSquad_test_common import BaseTestCase
def grep_loss_from_file(file_name):
    """Return the last loss value reported in a training log, or 0.0 if none is found."""
    loss = 0.0
    with open(file_name, 'r') as f:
        lines = f.readlines()
        line_filter = "bert_squad_progress: step="
        match_number = re.compile(r'loss=([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')
for line in lines:
if line_filter in line:
loss = re.findall(match_number, line)
loss = float(loss[0])
if loss == 0.0:
print("no loss found in file ", file_name)
return loss
class BingBertSquadFuncTestCase(BaseTestCase):
def __init__(self, methodName="DeepSpeed function test on BingBertSquad model"):
super(BingBertSquadFuncTestCase, self).__init__(methodName)
def setUp(self):
self.save_dir = os.getcwd()
new_dir = os.path.dirname(__file__)
if new_dir:
os.chdir(new_dir)
def tearDown(self):
os.chdir(self.save_dir)
def test_gpu4_fp16(self):
test_config = {
"gpus": 4,
"deepspeed": False,
"json": "deepspeed_bsz24_fp16_config.json",
"max_steps": 8,
"max_epoch_steps": 4,
"other_args": "--fp16 --print_steps 1"
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_gpu1_fp16(self):
test_config = {
"gpus": 1,
"deepspeed": False,
"json": "deepspeed_bsz24_fp16_config.json",
"max_steps": 8,
"max_epoch_steps": 4,
"other_args": "--fp16 --print_steps 1"
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_gpu4_fp32(self):
test_config = {
"gpus": 4,
"deepspeed": False,
"json": "deepspeed_bsz24_fp32_config.json",
"max_steps": 8,
"max_epoch_steps": 4,
"other_args": "--print_steps 1"
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_gpu1_fp32(self):
test_config = {
"gpus": 1,
"deepspeed": False,
"json": "deepspeed_bsz24_fp32_config.json",
"max_steps": 8,
"max_epoch_steps": 4,
"other_args": "--print_steps 1"
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def run_test(self, test_config, r_tol):
print("\n")
print("{0}: starting......".format(self.id()))
prefix = "BingBertSquad_func"
test_config['other_args'] += f" --max_steps {test_config['max_steps']}"
test_config[
'other_args'] += f" --max_steps_per_epoch {test_config['max_epoch_steps']}"
# baseline run...
test_config["deepspeed"] = False
base_file = self.gen_output_name(test_config, prefix)
# skip baseline run if it exists.
if not self.has_loss_data(base_file):
print("{0}: baseline run.".format(self.id()))
self.run_BingBertSquad_test(test_config, base_file)
else:
print("{0}: baseline exists.".format(self.id()))
# DeepSpeed run...
test_config["deepspeed"] = True
print("{0}: DeepSpeed run.".format(self.id()))
test_file = self.gen_output_name(test_config, prefix)
self.run_BingBertSquad_test(test_config, test_file)
return self.check_parity(base_file, test_file, r_tol)
def has_loss_data(self, file_name):
has_loss = False
if os.path.exists(file_name):
loss = grep_loss_from_file(file_name)
if loss != 0.0:
has_loss = True
return has_loss
def check_parity(self, base_file, test_file, r_tol):
base_loss = grep_loss_from_file(base_file)
test_loss = grep_loss_from_file(test_file)
print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
if base_loss == 0.0 or test_loss == 0.0:
return False
if abs((base_loss - test_loss) / base_loss) > r_tol:
return False
return True
def suite():
suite = unittest.TestSuite()
suite.addTest(BingBertSquadFuncTestCase('test_gpu4_fp16'))
suite.addTest(BingBertSquadFuncTestCase('test_gpu1_fp16'))
suite.addTest(BingBertSquadFuncTestCase('test_gpu4_fp32'))
suite.addTest(BingBertSquadFuncTestCase('test_gpu1_fp32'))
return suite
if __name__ == '__main__':
runner = unittest.TextTestRunner(failfast=True)
runner.run(suite())
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
import unittest
import subprocess
import os
import time
import re
class BaseTestCase(unittest.TestCase):
def __init__(self, methodName="DeepSpeed performance test"):
super(BaseTestCase, self).__init__(methodName)
self.test_dir = "./test"
self.baseline_dir = "./baseline"
self.timestr = time.strftime("%Y%m%d-%H%M%S")
def gen_output_name(self, test_config, prefix):
other_args = test_config["other_args"] if "other_args" in test_config else ""
zero_args = "_zero" if "zero" in test_config and test_config["zero"] else ""
other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")
if other_args:
other_args = "_" + other_args
if test_config["deepspeed"]:
file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"],
other_args,
zero_args,
self.timestr)
save_dir = self.test_dir
else:
file_name = "_gpu{0}_{1}.log".format(test_config["gpus"], other_args)
save_dir = self.baseline_dir
return os.path.join(save_dir, prefix + file_name)
def ensure_directory_exists(self, filename):
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
def clean_test_env(self):
cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
print(cmd)
subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
time.sleep(20)
def run_BingBertSquad_test(self, test_config, output):
ds_flag = " -d --deepspeed_config " + test_config["json"] if test_config[
"deepspeed"] else " "
other_args = " " + test_config[
"other_args"] if "other_args" in test_config else " "
cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format(
test_config["gpus"],
other_args,
ds_flag)
self.ensure_directory_exists(output)
with open(output, "w") as f:
print(cmd)
subprocess.run(cmd,
shell=True,
check=False,
executable='/bin/bash',
stdout=f,
stderr=f)
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
from .BingBertSquad_run_func_test import BingBertSquadFuncTestCase
from .BingBertSquad_run_func_test import suite
{
"tensorboard": {
"enabled": false,
"job_name": "MyJob"
},
"zero_optimization": true,
"disable_allgather": false,
"allgather_size": 200000,
"wall_clock_breakdown": false,
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"max_grad_norm": 1.0,
"weight_decay": 0.0,
"bias_correction": false
}
},
"fp16": {
"enabled": true
}
}
{
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"max_grad_norm": 1.0,
"weight_decay": 0.0,
"bias_correction": false
}
},
"fp16": {
"enabled": false
}
}
#!/bin/bash
usage() {
echo """
Usage: $0 [defined arguments...] [other arguments...]
[defined]
-g, --num_gpus num gpus per node to use
-h, --help this help text
-n, --num_nodes num nodes to use
-e, --epochs num of training epochs
-b, --batch_size training batch size
-p, --master_port   master port for NCCL
[other arguments]
all undefined arguments will be passed to the user's application
"""
}
validate_folder() {
dir=$1
dir_name=$2
if [[ -d ${dir} ]]; then
echo "Using ${dir_name}: ${dir}"
else
echo "${dir} folder not found"
exit 1
fi
}
remove_folder() {
dir=$1
dir_name=$2
if [[ -d ${dir} ]]; then
echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
rm -rvf ${dir}
fi
}
num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
master_port=$((20000+RANDOM%5000))
LR=3e-5
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-g|--num_gpus)
num_gpus="$2"
shift
shift
;;
-n|--num_nodes)
num_nodes="$2"
shift
shift
;;
-e|--epochs)
epochs="$2"
shift
shift
;;
-b|--batch_size)
batch_size="$2"
shift
shift
;;
-p|--master_port)
master_port="$2"
shift
shift
;;
-d|--deepspeed)
enable_deepspeed=true
shift
;;
-h|--help)
usage
exit 0
;;
*) # other arguments
other_args="${other_args} $1"
shift
;;
esac
done
# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"
# Validate path to processed Squad data
if [ -z "${SQUAD_DIR+x}" ]; then
export SQUAD_DIR=/data/BingBertSquad
echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder ${SQUAD_DIR} "SQUAD_DIR"
# Set output path
if [ -z "${OUTPUT_DIR+x}" ]; then
export OUTPUT_DIR=/tmp/BingBertSquad-Output
echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
remove_folder ${OUTPUT_DIR} "OUTPUT_DIR"
echo "num_nodes: ${num_nodes}"
echo "num_gpus: ${num_gpus}"
echo "epochs: ${epochs}"
echo "batch_size: ${batch_size}"
echo "master_port: ${master_port}"
echo "deepspeed: ${enable_deepspeed}"
echo "other_args: ${other_args}"
EFFECTIVE_BATCH_SIZE=${batch_size}
MAX_GPU_BATCH_SIZE=6
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
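# Worked example with the defaults above (illustrative): batch_size=24 and
# num_gpus=8 give PER_GPU_BATCH_SIZE=3; since 3 < MAX_GPU_BATCH_SIZE=6,
# GRAD_ACCUM_STEPS=1. With num_gpus=1, PER_GPU_BATCH_SIZE=24 and
# GRAD_ACCUM_STEPS=24/6=4, keeping the effective batch size at 24.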
if [[ ${enable_deepspeed} == true ]]; then
BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py
else
BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_baseline.py
fi
JOB_NAME="BingBertSquad_ds-${enable_deepspeed}_${num_gpus}-gpu"
# --do_predict \
squad_args="--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--train_file ${SQUAD_DIR}/train-v1.1.json \
--predict_file ${SQUAD_DIR}/dev-v1.1.json \
--train_batch_size ${PER_GPU_BATCH_SIZE} \
--learning_rate ${LR} \
--num_train_epochs ${epochs} \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ${OUTPUT_DIR} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--job_name ${JOB_NAME} \
--model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar
"
run_cmd="deepspeed.pt \
--num_nodes ${num_nodes} \
--num_gpus ${num_gpus} \
--master_port ${master_port}
${BingBertSquad_script} ${other_args} ${squad_args}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt
#!/bin/bash
usage() {
echo """
Usage: $0 [defined arguments...] [other arguments...]
[defined]
-g, --num_gpus num gpus per node to use
-h, --help this help text
-n, --num_nodes num nodes to use
-e, --epochs num of training epochs
-b, --batch_size training batch size
-p, --master_port   master port for NCCL
[other arguments]
all undefined arguments will be passed to the user's application
"""
}
validate_folder() {
dir=$1
dir_name=$2
if [[ -d ${dir} ]]; then
echo "Using ${dir_name}: ${dir}"
else
echo "${dir} folder not found"
exit 1
fi
}
remove_folder() {
dir=$1
dir_name=$2
if [[ -d ${dir} ]]; then
echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
rm -rvf ${dir}
fi
}
num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
master_port=$((20000+RANDOM%5000))
LR=3e-5
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-g|--num_gpus)
num_gpus="$2"
shift
shift
;;
-n|--num_nodes)
num_nodes="$2"
shift
shift
;;
-e|--epochs)
epochs="$2"
shift
shift
;;
-b|--batch_size)
batch_size="$2"
shift
shift
;;
-p|--master_port)
master_port="$2"
shift
shift
;;
-d|--deepspeed)
enable_deepspeed=true
echo "Found deespcale flag"
shift
;;
-h|--help)
usage
exit 0
;;
*) # other arguments
other_args="${other_args} $1"
shift
;;
esac
done
# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"
# Validate path to processed Squad data
if [ -z "${SQUAD_DIR+x}" ]; then
export SQUAD_DIR=/data/BingBertSquad
echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder ${SQUAD_DIR} "SQUAD_DIR"
# Set output path
if [ -z "${OUTPUT_DIR+x}" ]; then
export OUTPUT_DIR=/tmp/BingBertSquad-Output
echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
remove_folder ${OUTPUT_DIR} "OUTPUT_DIR"
echo "num_nodes: ${num_nodes}"
echo "num_gpus: ${num_gpus}"
echo "epochs: ${epochs}"
echo "batch_size: ${batch_size}"
echo "master_port: ${master_port}"
echo "deepspeed: ${enable_deepspeed}"
echo "other_args: ${other_args}"
EFFECTIVE_BATCH_SIZE=${batch_size}
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
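# Worked example (illustrative): here MAX_GPU_BATCH_SIZE=3, so with
# num_gpus=8 each GPU gets 3 samples and GRAD_ACCUM_STEPS=3/3=1; with
# num_gpus=1, PER_GPU_BATCH_SIZE=24 and GRAD_ACCUM_STEPS=24/3=8.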
if [[ ${enable_deepspeed} == true ]]; then
BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py
else
BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_baseline.py
fi
JOB_NAME="BingBertSquad_ds-${enable_deepspeed}_${num_gpus}-gpu"
squad_args="--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--train_file ${SQUAD_DIR}/train-v1.1.json \
--predict_file ${SQUAD_DIR}/dev-v1.1.json \
--train_batch_size ${PER_GPU_BATCH_SIZE} \
--learning_rate ${LR} \
--num_train_epochs ${epochs} \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ${OUTPUT_DIR} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--job_name ${JOB_NAME} \
--model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar
"
run_cmd="deepspeed.pt \
--num_nodes ${num_nodes} \
--num_gpus ${num_gpus} \
--master_port ${master_port}
${BingBertSquad_script} ${other_args} ${squad_args}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt
#!/bin/bash
if [[ ! -d logs ]]
then
mkdir logs
fi
validate_file() {
file=$1
file_name=$2
if [[ -f $file ]]; then
echo "Using ${file_name}: ${file}"
else
echo "${file} not found"
exit 1
fi
}
validate_folder() {
dir=$1
dir_name=$2
if [[ -d ${dir} ]]; then
echo "Using ${dir_name}: ${dir}"
else
echo "${dir} folder not found"
exit 1
fi
}
# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"
fp16_config_json=deepspeed_bsz24_fp16_config.json
validate_file ${fp16_config_json} "fp16_config_json"
fp32_config_json=deepspeed_bsz24_fp32_config.json
validate_file ${fp32_config_json} "fp32_config_json"
start_time=`date +"%D %T"`
echo "---------------begin @ ${start_time}--------------"
# Note: you may play around with commented parts below (num_gpus and nohup command) for simultaneous runs; just make sure your hardware allocation can support it
for num_gpus in 8 1 # 4 2
do
#run_cmd="nohup bash run_BingBertSquad.sh -g ${num_gpus} -d --deepspeed_config ${fp16_config_json} --fp16 > logs/deepspeed_fp16_${num_gpus}_`date +"%Y%m%d%H%M%S"`.out 2> logs/deepspeed_fp16_${num_gpus}_`date +"%Y%m%d%H%M%S"`.err &"
run_cmd="bash run_BingBertSquad.sh -g ${num_gpus} -d --deepspeed_config ${fp16_config_json} --fp16"
start_time=`date +"%D %T"`
echo "---------------begin @ ${start_time}--------------"
echo ${run_cmd}
eval ${run_cmd}
end_time=`date +"%D %T"`
echo "---------------finish @ ${end_time} --------------"
#run_cmd="nohup bash run_BingBertSquad.sh -g ${num_gpus} -d --deepspeed_config ${fp32_config_json} > logs/deepspeed_fp32_${num_gpus}_`date +"%Y%m%d%H%M%S"`.out 2> logs/deepspeed_fp32_${num_gpus}_`date +"%Y%m%d%H%M%S"`.err &"
run_cmd="bash run_BingBertSquad.sh -g ${num_gpus} -d --deepspeed_config ${fp32_config_json}"
start_time=`date +"%D %T"`
echo "---------------begin @ ${start_time}--------------"
echo ${run_cmd}
eval ${run_cmd}
end_time=`date +"%D %T"`
echo "---------------finish @ ${end_time} --------------"
#run_cmd="nohup bash run_BingBertSquad.sh -g ${num_gpus} --fp16 > logs/baseline_fp16_${num_gpus}_`date +"%Y%m%d%H%M%S"`.out 2> logs/baseline_fp16_${num_gpus}_`date +"%Y%m%d%H%M%S"`.err &"
run_cmd="bash run_BingBertSquad.sh -g ${num_gpus} --fp16"
start_time=`date +"%D %T"`
echo "---------------begin @ ${start_time}--------------"
echo ${run_cmd}
eval ${run_cmd}
end_time=`date +"%D %T"`
echo "---------------finish @ ${end_time} --------------"
#run_cmd="nohup bash run_BingBertSquad.sh -g ${num_gpus} > logs/baseline_fp32_${num_gpus}_`date +"%Y%m%d%H%M%S"`.out 2> logs/baseline_fp32_${num_gpus}_`date +"%Y%m%d%H%M%S"`.err &"
run_cmd="bash run_BingBertSquad.sh -g ${num_gpus}"
start_time=`date +"%D %T"`
echo "---------------begin @ ${start_time}--------------"
echo ${run_cmd}
eval ${run_cmd}
end_time=`date +"%D %T"`
echo "---------------finish @ ${end_time} --------------"
done
end_time=`date +"%D %T"`
echo "---------------finish @ ${end_time} --------------"
set +x
@@ -7,11 +7,13 @@ import sys
import unittest
sys.path.append('../DeepSpeedExamples/Megatron_GPT2')
sys.path.append('../DeepSpeedExamples/BingBertSquad')
import os
# Import the test cases here.
import Megatron_GPT2
import BingBertSquad
def pytest_hack(runner_result):
@@ -31,8 +33,9 @@ def test_run():
runner = unittest.TextTestRunner(failfast=True)
# Add test suites here.
pytest_hack(runner.run(Megatron_GPT2.suite()))
pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))
#pytest_hack(runner.run(Megatron_GPT2.suite()))
#pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))
pytest_hack(runner.run(BingBertSquad.suite()))
if __name__ == '__main__':
......
import torch
import deepspeed
import argparse
import pytest
import json
import os
from common import distributed_test
def create_config_from_dict(tmpdir, config_dict):
config_path = os.path.join(tmpdir, 'temp_config.json')
with open(config_path, 'w') as fd:
json.dump(config_dict, fd)
return config_path
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
if empty_grad:
self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
hidden_dim = x
hidden_dim = self.linear(hidden_dim)
return self.cross_entropy_loss(hidden_dim, y)
def test_temp_config_json(tmpdir):
config_dict = {
"train_batch_size": 1,
}
config_path = create_config_from_dict(tmpdir, config_dict)
config_json = json.load(open(config_path, 'r'))
assert 'train_batch_size' in config_json
def prepare_optimizer_parameters(model):
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{
'params': [p for n,
p in param_optimizer],
'weight_decay': 0.0
}]
return optimizer_grouped_parameters
def get_data_loader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
return train_loader
def get_args(tmpdir, config_dict):
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepspeed = True
args.deepspeed_config = config_path
args.local_rank = 0
return args
def test_lamb_fp16_basic(tmpdir):
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
"optimizer": {
"type": "Lamb",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1, 2])
def _test_lamb_fp16_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
def test_lamb_fp16_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Lamb",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
@distributed_test(world_size=[1])
def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
def test_adamw_fp16_basic(tmpdir):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
def test_adamw_fp16_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
@distributed_test(world_size=[1])
def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)