diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8c0501439e9d5fa175f5aa9b62d286e690a10904
--- /dev/null
+++ b/benchmark/cluster/vgg16/run_vgg_dist.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Update to point to the source file.
+VGG_SRC="vgg16_fluid.py"
+
+export TRAINING_ROLE=PSERVER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
+
+# Need to wait for the ps to start first.
+sleep 10
+echo "done start ps"
+
+export TRAINING_ROLE=TRAINER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
+CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 12c739480801a7f422773be6dad7dfe002999284..6c47f6535c3cfc31ba165ccec07ac95a9bfd0a7f 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -11,25 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""VGG16 benchmark in Fluid
-
-# Single trainer, single PS on a single machine.
-VGG_SRC="${CODE_DIR}/vgg16_fluid.py"
-export TRAINING_ROLE=PSERVER
-export TRAINERS=1
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-sleep 10 # wait for PS to start.
-export TRAINING_ROLE=TRAINER
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU &
-
-# To run multiple trainers on a single machine
-# change TRAINERS=2 and launch 2 trainers.
-# CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-# CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
-"""
-
+"""VGG16 benchmark in Fluid"""
 from __future__ import print_function
 
 import sys