#!/bin/bash

# Abort on the first failing command; the ERR trap below reports where.
set -e

# Print a failure report for the installer.
# Arguments: $1 - line number of the command that failed
# Outputs:   two diagnostic lines on stderr
err_report() {
    echo "Error on line $1" >&2
    echo "Fail to install deepspeed" >&2
}
# Single quotes defer the expansion of $LINENO until the trap actually fires.
trap 'err_report $LINENO' ERR

# Print the command-line help text to stdout.
# The here-doc delimiter is quoted so the text is emitted literally.
usage() {
    cat <<'EOF'

Usage: install.sh [options...]

By default will install deepspeed and all third party dependencies across all machines listed in
hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally

[optional]
    -d, --deepspeed_only    Install only deepspeed and no third party dependencies
    -t, --third_party_only  Install only third party dependencies and not deepspeed
    -l, --local_only        Install only on local machine
    -s, --pip_sudo          Run pip with sudo (default: no sudo)
    -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
    -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
    -a, --apex_commit       Install a specific commit hash of apex, instead of the one deepspeed points to
    -k, --skip_requirements Skip installing DeepSpeed requirements
    -h, --help              This help text

EOF
}

# Option defaults; each may be overridden by the command-line flags parsed below.
ds_only=0               # becomes 1 when -d/--deepspeed_only is given
tp_only=0               # becomes 1 when -t/--third_party_only is given
deepspeed_install=1
third_party_install=1
local_only=0
pip_sudo=0
entire_dlts_job=1       # NOTE(review): not read anywhere in the visible script - confirm before removing
hostfile=/job/hostfile
pip_mirror=""
apex_commit=""
skip_requirements=0

# Abort with a clear message unless an option is followed by a value.
# Arguments: $1 - the option as typed
#            $2 - number of arguments still unparsed (the option included)
require_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Option $1 requires an argument" >&2
        exit 1
    fi
}

# Parse the installer's command-line flags into the globals declared above.
# Arguments: the script's command line ("$@")
# Exits:     0 after printing help; 1 on an unknown flag, a flag that is
#            missing its value, or a -H path that is not an existing file
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -d|--deepspeed_only)
                deepspeed_install=1
                third_party_install=0
                ds_only=1
                shift
                ;;
            -t|--third_party_only)
                deepspeed_install=0
                third_party_install=1
                tp_only=1
                shift
                ;;
            -l|--local_only)
                local_only=1
                shift
                ;;
            -s|--pip_sudo)
                pip_sudo=1
                shift
                ;;
            -m|--pip_mirror)
                require_value "$1" "$#"
                pip_mirror=$2
                shift 2
                ;;
            -a|--apex_commit)
                require_value "$1" "$#"
                apex_commit=$2
                shift 2
                ;;
            -k|--skip_requirements)
                skip_requirements=1
                shift
                ;;
            -H|--hostfile)
                require_value "$1" "$#"
                hostfile=$2
                # Quoted so an empty or space-containing path is tested as given.
                if [[ ! -f "$hostfile" ]]; then
                    echo "User provided hostfile does not exist at $hostfile, exiting" >&2
                    exit 1
                fi
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown argument: $1" >&2
                usage >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# -d turns off the third-party install and -t turns off the deepspeed install,
# so asking for both at once is rejected.
if [[ "$ds_only" == "1" && "$tp_only" == "1" ]]; then
    echo "-d and -t are mutually exclusive, only choose one or none"
    usage
    exit 1
fi

echo "Updating git hash/branch info"
# Write the short commit hash and the branch name as Python assignments into
# deepspeed/git_version_info.py (replacing any previous content), then show it.
version_file=deepspeed/git_version_info.py
{
    echo "git_hash = '$(git rev-parse --short HEAD)'"
    echo "git_branch = '$(git rev-parse --abbrev-ref HEAD)'"
} > "$version_file"
cat "$version_file"
# Prefix placed in front of pip commands: "sudo -H" when pip_sudo is 1, else empty.
PIP_SUDO=""
if [[ "$pip_sudo" == "1" ]]; then
  PIP_SUDO="sudo -H"
fi

# Base install command; points pip at the mirror when one was supplied.
PIP_INSTALL="pip install"
if [[ -n "$pip_mirror" ]]; then
  PIP_INSTALL="pip install -i $pip_mirror"
fi


# Without a hostfile there is no list of remote machines, so fall back to a
# local-only install. The path is quoted so that an empty value (or one with
# spaces) is tested as a path instead of collapsing the test to `[ ! -f ]`.
if [[ ! -f "$hostfile" ]]; then
    echo "No hostfile exists at $hostfile, installing locally"
    local_only=1
fi

# Ensure dependencies are installed locally, unless skip_requirements was set.
if [[ "$skip_requirements" == "0" ]]; then
    # PIP_SUDO and PIP_INSTALL are left unquoted on purpose: each holds a
    # multi-word command that must be split into separate arguments.
    $PIP_SUDO $PIP_INSTALL -r requirements.txt
fi
# Build wheels
if [[ "$third_party_install" == "1" ]]; then
    echo "Checking out sub-module(s)"
    git submodule update --init --recursive

    echo "Building apex wheel"
    cd third_party/apex

    if [[ -n "$apex_commit" ]]; then
        echo "Installing a non-standard version of apex at commit: $apex_commit"
        git fetch
        # Quoted so the user-supplied revision reaches git as a single argument.
        git checkout "$apex_commit"
    fi

    python setup.py --cpp_ext --cuda_ext bdist_wheel
    # Return to the directory we were in before entering third_party/apex.
    cd -

    echo "Installing apex locally so that deepspeed will build"
    # PIP_SUDO / PIP_INSTALL are unquoted on purpose: they hold multi-word commands.
    $PIP_SUDO pip uninstall -y apex
    $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl
fi
if [[ "$deepspeed_install" == "1" ]]; then
    echo "Building deepspeed wheel"
    python setup.py bdist_wheel
fi
# Install the built wheels: on this machine only when local_only is 1,
# otherwise on every host listed in the hostfile via pdsh/pdcp.
if [[ "$local_only" == "1" ]]; then
    if [[ "$deepspeed_install" == "1" ]]; then
        echo "Installing deepspeed"
        $PIP_SUDO pip uninstall -y deepspeed
        $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
        python -c 'import deepspeed; print("deepspeed info:", deepspeed.__version__, deepspeed.__git_branch__, deepspeed.__git_hash__)'
        echo "Installation is successful"
    fi
else
    local_path=$(pwd)   # NOTE(review): not read anywhere in the visible script
    if [[ -f "$hostfile" ]]; then
        # First column of each line is the host name. Blank lines and lines
        # starting with '#' are skipped so they do not become empty or bogus
        # entries in the comma-separated list handed to pdsh.
        hosts=$(awk 'NF && $1 !~ /^#/ {print $1}' "$hostfile" | paste -sd "," -)
    else
        echo "hostfile not found, cannot proceed"
        exit 1
    fi
    if [[ -z "$hosts" ]]; then
        echo "No hosts found in $hostfile, cannot proceed" >&2
        exit 1
    fi
    export PDSH_RCMD_TYPE=ssh
    tmp_wheel_path="/tmp/deepspeed_wheels"

    # Prepare the staging directory on every host. rm -f keeps an existing
    # directory that holds no wheels from producing an error.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
    pdcp -w "$hosts" requirements.txt "${tmp_wheel_path}/"
    if [[ "$skip_requirements" == "0" ]]; then
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
    fi
    if [[ "$third_party_install" == "1" ]]; then
        pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y apex"
        pdcp -w "$hosts" third_party/apex/dist/apex*.whl "$tmp_wheel_path/"
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl"
        # -S makes pdsh return the largest remote exit status, so a host that
        # cannot import apex stops the script here instead of passing silently.
        pdsh -S -w "$hosts" 'python -c "import apex"'
    fi
    if [[ "$deepspeed_install" == "1" ]]; then
        echo "Installing deepspeed"
        pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y deepspeed"
        pdcp -w "$hosts" dist/deepspeed*.whl "$tmp_wheel_path/"
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
        # -S: a failed import on any host must not be reported as success below.
        pdsh -S -w "$hosts" "python -c 'import deepspeed; print(\"deepspeed info:\", deepspeed.__version__, deepspeed.__git_branch__, deepspeed.__git_hash__)'"
        echo "Installation is successful"
    fi
    # Remove the staged files and the (then empty) staging directory.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl $tmp_wheel_path/requirements.txt; rmdir $tmp_wheel_path; fi"
fi