diff --git a/azure/README.md b/azure/README.md index 1cca695bfa7e1ef6a45a5f680134c97b86a46948..df222b9a2759f7e5e16516456c56689a15de1f6b 100644 --- a/azure/README.md +++ b/azure/README.md @@ -1,3 +1,3 @@ # Getting Started with DeepSpeed on Azure -Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure! +The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). For more details, please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). diff --git a/azure/attach.sh b/azure/attach.sh deleted file mode 100755 index c23127b0fb61f62188f41aa2677e97c8121b0131..0000000000000000000000000000000000000000 --- a/azure/attach.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -name=${1-deepspeed} -docker exec -i -w /home/deepspeed -t $name /bin/bash diff --git a/azure/azure_config.json b/azure/azure_config.json deleted file mode 100644 index 9c61e4d3705c34442c7d53b2be24792b8cd61ca3..0000000000000000000000000000000000000000 --- a/azure/azure_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "num_vms": 2, - "location": "southcentralus", - "azure_sku": "Standard_NV6_Promo", - "ssh_private_key": "id_rsa", - "docker_ssh_port": 2222 -} diff --git a/azure/azure_ssh.sh b/azure/azure_ssh.sh deleted file mode 100755 index 3259a3c88341a4e35099cbbb34b105f0d737c6aa..0000000000000000000000000000000000000000 --- a/azure/azure_ssh.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -config_file=azure_config.json -if [ ! 
-f ${config_file} ]; then - echo "Cannot find $config_file" - exit 1 -fi - -location=`cat ${config_file} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -while getopts 'c:' flag; do - case "${flag}" in - c) config_file="${OPTARG}" ;; - *) error "Unexpected option ${flag}" ;; - esac -done -shift $(expr $OPTIND - 1) -echo "Using $config_file" - -nodeid=$1 -cmds=${@:2} -echo $nodeid $cmds -ip_addr=`az vm list-ip-addresses -g $rg | jq .[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - -ssh_private_key=`cat ${config_file} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi - -ssh -i ${ssh_private_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null deepspeed@${ip_addr} ${cmds} diff --git a/azure/build_docker_image.sh b/azure/build_docker_image.sh deleted file mode 100755 index e8617f0844f5abe4696f5968c08bc8387d442699..0000000000000000000000000000000000000000 --- a/azure/build_docker_image.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -docker build -t deepspeed:0.1 -f ../Dockerfile . diff --git a/azure/create_vms.sh b/azure/create_vms.sh deleted file mode 100755 index 257a011f035c73f81c62ab8084871f6df5a010fb..0000000000000000000000000000000000000000 --- a/azure/create_vms.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json - -# Make sure jq is installed -command -v jq -if [ $? != 0 ]; then - echo "Missing dependency of jq, please 'apt-get install jq'" - exit 1 -fi - -if [ ! 
-f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -cat $azure_config - -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -if [ $location == "null" ]; then echo 'missing location in config'; exit 1; fi -azure_sku=`cat ${azure_config} | jq .azure_sku | sed 's/"//g'` -if [ $azure_sku == "null" ]; then echo 'missing azure_sku in config'; exit 1; fi -ssh_private_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -ssh_key=${ssh_private_key}.pub - -if [ ! -f ${ssh_private_key} ]; then - echo "Cannot find $ssh_private_key" - exit 1 -fi -if [ ! -f ${ssh_key} ]; then - echo "Cannot find $ssh_key" - exit 1 -fi - -resource_group=deepspeed_rg_$location -az group create --name ${resource_group} --location $location - -base_vm_name=deepspeed -vm_image="nvidia:ngc_azure_17_11:ngc_gpu_cloud_19_11_3:19.11.3" - -az vm image terms accept --urn ${vm_image} - -for i in `seq 0 $(( num_vms - 1))`; do - vm_name=${base_vm_name}_$i - echo "creating $vm_name" - az vm create \ - --resource-group ${resource_group} \ - --name ${vm_name} \ - --image ${vm_image} \ - --admin-username deepspeed \ - --size ${azure_sku} \ - --ssh-key-values ${ssh_key} -done diff --git a/azure/setup_docker.sh b/azure/setup_docker.sh deleted file mode 100755 index 7b8d5cfcdd51e88c562a82d344302d235f8bc865..0000000000000000000000000000000000000000 --- a/azure/setup_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! -f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -parallel=true -command -v pdsh -if [ $? 
!= 0 ]; then - echo "Installing pdsh will allow for the docker pull to be done in parallel across the cluster. See: 'apt-get install pdsh'" - parallel=false -fi - -ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi - -args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -username=deepspeed - -update_script=" -docker pull deepspeed/deepspeed:latest; -ln -s workdir/DeepSpeed/azure/attach.sh attach.sh; -cd workdir/DeepSpeed; -git pull; -git submodule update --init --recursive; -bash azure/start_container.sh; -" - -if [ $parallel == true ]; then - echo "parallel docker pull" - hosts="" - for node_id in {0..1}; do - addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - hosts="${addr},${hosts}" - done - PDSH_RCMD_TYPE=ssh PDSH_SSH_ARGS_APPEND=${args} pdsh -w $hosts -l ${username} $update_script -else - echo "sequential docker pull" - for node_id in `seq 0 $((num_vms - 1))`; do - ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - addr=${username}@${ip_addr} - ssh ${args} $addr $update_script - done -fi diff --git a/azure/setup_vms.sh b/azure/setup_vms.sh deleted file mode 100755 index 118bed2ce7279fc62086e913ca66f2fdba01b710..0000000000000000000000000000000000000000 --- a/azure/setup_vms.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! 
-f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -docker_ssh_port=`cat ${azure_config} | jq .docker_ssh_port` -if [ $docker_ssh_port == "null" ]; then echo 'missing docker_ssh_port in config'; exit 1; fi - -username=deepspeed -args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" - -num_vms=`az vm list -g $rg | jq '. | length'` -first_ip_addr=`az vm list-ip-addresses -g $rg | jq .[0].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` -num_slots=`ssh $args ${username}@${first_ip_addr} 'nvidia-smi -L | wc -l'` -echo "number of slots per vm: $num_slots" - -hostfile=hostfile -ssh_config=config -echo -n "" > $hostfile -echo -n "" > $ssh_config -for node_id in `seq 0 $((num_vms - 1))`; do - private_ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.privateIpAddresses[0] | sed 's/"//g'` - echo "worker-${node_id} slots=${num_slots}" >> hostfile - echo "Host worker-${node_id} - HostName ${private_ip_addr} - Port ${docker_ssh_port} - StrictHostKeyChecking no - " >> ${ssh_config} -done - -update_script=" -sudo mkdir -p /job; -sudo chmod -R 777 /job; -mkdir -p workdir; -git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed; -" - -for node_id in `seq 0 $((num_vms - 1))`; do - ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - addr=${username}@${ip_addr} - echo "copying ssh keys, ssh config, hostfile to worker-${node_id}" - ssh $args ${addr} $update_script - scp $args ${ssh_key}* ${addr}:.ssh/ - scp $args ${ssh_config} ${addr}:.ssh/ - scp $args ${hostfile} ${addr}:/job/ -done -rm $hostfile $ssh_config diff --git a/azure/shutdown_vms.sh 
b/azure/shutdown_vms.sh deleted file mode 100755 index 75317118be436ef487544f8edef2e26f4a4829d0..0000000000000000000000000000000000000000 --- a/azure/shutdown_vms.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! -f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi - -delete=0 -while getopts 'd' flag; do - case "${flag}" in - d) delete=1 ;; - *) - echo "Unexpected option ${flag}" - exit 1 - ;; - esac -done - -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -if [ $location == "null" ]; then echo 'missing location in config'; exit 1; fi - -base_vm_name=deepspeed -resource_group=deepspeed_rg_$location - -for i in `seq 0 $(( num_vms - 1))`; do - vm_name=${base_vm_name}_$i - if [ $delete == 0 ]; then - echo "deallocating $vm_name" - az vm deallocate --resource-group $resource_group --name $vm_name --no-wait - else - echo "deleting $vm_name" - az vm delete -y --resource-group $resource_group --name $vm_name --no-wait - fi -done diff --git a/azure/start_container.sh b/azure/start_container.sh deleted file mode 100755 index 7e6aae5406b652700ec20bf3736aaeb70d89d0f0..0000000000000000000000000000000000000000 --- a/azure/start_container.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -name=${1-deepspeed} -image=deepspeed/deepspeed:latest -echo "starting docker image named $name" -docker run -d -t --name $name \ - --network host \ - -v ${HOME}/workdir:/home/deepspeed/workdir \ - -v ${HOME}/.ssh:/home/deepspeed/.ssh \ - -v /job/hostfile:/job/hostfile \ - --gpus all $image bash -c 'sudo service ssh start && sleep infinity' diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index 1016aeafd007c2257a24e54e735e6ec923869106..a2c558444844251eb1a99eacf8fbe9ad044438f9 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -3,132 +3,18 @@ title: "Getting Started 
with DeepSpeed on Azure" tags: getting-started --- -This tutorial will help you get started running DeepSpeed on [Azure virtual -machines](https://azure.microsoft.com/en-us/services/virtual-machines/). -Looking forward, we will be integrating these techniques and additional enhancements -into the [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) platform to -benefit all your large model training jobs. +This tutorial will help you get started with DeepSpeed on Azure. If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/). -To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). +# DeepSpeed on Azure via AzureML -To help with launching Azure instances we suggest using the [Azure -CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created -several helper scripts to get you quickly started using DeepSpeed with Azure. - * Install Azure CLI on your local box: [https://docs.microsoft.com/en-us/cli/azure/install-azure-cli](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). - * Alternatively, you can use the Azure in-browser shell: [https://shell.azure.com/](https://shell.azure.com/). +The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). Please take a look at easy-to-use examples for Megatron-DeepSpeed, Transformers and CIFAR training [here](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). -## Create an SSH key -Generate an SSH key that will be used across this tutorial to SSH into your VMs and -between Docker containers. 
`ssh-keygen` is the recommended way of doing this. Our scripts -assume your key is located inside the same directory as the Azure scripts. +> Our [Megatron-DeepSpeed](https://github.com/microsoft/megatron-deepspeed) contains the most up-to-date [recipe](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml) for end-to-end training on AzureML. -## Azure Config JSON -Our helper scripts depend on the following a configuration JSON for deployment -and setup. We have provided a simple example JSON in `azure_config.json` that -sets up a basic environment with two VMs. This config uses the NV6_Promo -instance type which has one NVIDIA Tesla M60 GPU per VM. You can read more -details about the VM on the [Linux Virtual Machines -Pricing](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/) -page. +# DeepSpeed on Azure VMs -See the example below: - ```json -{ - "num_vms": 2, - "location": "southcentralus", - "azure_sku": "Standard_NV6_Promo", - "ssh_private_key": "id_rsa", - "docker_ssh_port": 2222 -} -``` +If you don't have access to AzureML or if you want to build a custom environment using [Azure virtual machines](https://azure.microsoft.com/en-us/services/virtual-machines/) or Azure VM Scale-Sets ([VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview)), we are working on easy-to-use cluster setup scripts that will be published in the next few weeks. -## Dependencies -The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with -parsing JSON from the command line. Also it is recommended to install -[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel. - -## Create Azure VMs -We first need to allocate the VMs. We provide a script -```bash -./create_vms.sh -``` -to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel -free to customize your JSON to your desired region/SKU.
This step will take a few minutes -to complete while it sets up all of your VMs on Azure. - -## Setup VM environment to use DeepSpeed -Next, we need to configure the VM environment for DeepSpeed. We provide a script -```bash -./setup_vms.sh -``` -to generate a [hostfile](/getting-started/#resource-configuration-multi-node) and SSH -configuration on all of the VMs. This configuration will be used by the DeepSpeed -Docker containers in the next step. - -## Start the DeepSpeed docker container -We now setup the DeepSpeed Docker containers on the VMs. We provide a script -```bash -./setup_docker.sh -``` -to pull the DeepSpeed image onto all VMs and start a container instance in the -background. This will take several minutes since it needs to pull the entire Docker -image. - -## Access VMs -The tool `azure_ssh.sh` will let you SSH into any of the VMs with this -syntax: -```bash -./azure_ssh.sh [command] -``` -where the `node-id` is a number between `0` and `num_vms-1`. This script will find the -public IP address of your VM and use the SSH key provided in the Azure configuration -JSON. - -## Access DeepSpeed container -Everything should be up and running at this point. Let's access the running DeepSpeed -container on the first VM and make sure we can talk to the other containers in our deployment. - - * SSH into the first VM via: `./azure_ssh.sh 0` - * Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure` - * Attach the running docker container via: `./attach.sh` - * You should now be able to `ssh` into any other docker container, the containers can be - accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0` - and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1 - hostname` which will return the hostname of worker-1. 
- -## Parallel SSH across containers - DeepSpeed comes installed with a helper script `ds_ssh` which is a wrapper around - the [pdsh](https://linux.die.net/man/1/pdsh) command that lets you issue commands - to groups of hosts (via SSH) in parallel. This wrapper simply connects with the - hostfile that defines all the containers in your deployment. For example if you run - `ds_ssh hostname` you should see a list of all the hostnames in your deployment. - -## Run CIFAR-10 example model -We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside -the first DeepSpeed container: - - 1) Install the python dependencies necessary to run the CIFAR-10 example model. You can - do this across your cluster via: - ```bash - ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt - ``` - - 2) Now change directories to the CIFAR example: - ```bash - cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar - ``` - - 3) Finally, launch training across all VMs: - ```bash - deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json - ``` - -## Megatron-LM GPT2 -DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full -[Megatron tutorial](/tutorials/megatron/) for more details. - * In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of - Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup and - a batch size of 1536 you should be able to complete 100k training steps (153.6 million - samples) in less than 2 weeks of training. +If you already have a cluster setup, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azure) that can easily be modified to train various model configurations.