......@@ -36,7 +36,7 @@ RUN apt-get install -y python3 python3-dev && \
# TensorFlow
# Install the GPU build of TensorFlow from pip, pinned to the exact release
# named by TENSORFLOW_VERSION.
# NOTE(review): TENSORFLOW_VERSION is presumably declared earlier in this
# Dockerfile (ARG/ENV); if it is empty, the requirement becomes
# "tensorflow-gpu==" and pip will reject it - confirm.
RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION}
......@@ -34,7 +34,7 @@ model scientists also need to set up the state-of-the-art training techniques
such as distributed training, mixed precision, gradient accumulation, and
checkpointing. Even then, scientists may not achieve the desired system
performance and convergence rate. Large model sizes are even more challenging:
a large model easily runs out of memory with pure data parallelism and it is
difficult to use model parallelism. DeepSpeed addresses these challenges to
accelerate model development *and* training.
# Tutorial: CIFAR-10 with DeepSpeed
If you haven't already, we advise you to first read through the [Getting
Started](../../README.md#getting-started) guide before stepping through this tutorial.
In this tutorial we will be adding DeepSpeed to the CIFAR-10 model, which is a small image classification model.
......@@ -103,16 +103,19 @@ if [ "$third_party_install" == "1" ]; then
# Build a wheel for the apex checkout under third_party/, passing apex's
# --cpp_ext and --cuda_ext setup flags, then return to the previous directory.
# NOTE(review): the result of `cd` is not checked here; if it fails, the
# following `python setup.py` runs in the current directory instead. Confirm
# whether the script enables `set -e` elsewhere.
cd third_party/apex
python setup.py --cpp_ext --cuda_ext bdist_wheel
# `cd -` switches back to the previous working directory and prints its path.
cd -
echo "Installing apex locally so that deepspeed will build"
# Remove any apex that is already installed, then install the wheel built
# above. `sudo -H` points HOME at the target user's home directory for the
# duration of the pip call.
sudo -H pip uninstall -y apex
sudo -H pip install third_party/apex/dist/apex*.whl
if [ "$deepspeed_install" == "1" ]; then
echo "Installing deepspeed"
python setup.py bdist_wheel
if [ "$local_only" == "1" ]; then
if [ "$third_party_install" == "1" ]; then
echo "Installing apex"
echo "Installing apex locally"
sudo -H pip uninstall -y apex
sudo -H pip install third_party/apex/dist/apex*.whl
