diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh new file mode 100644 index 0000000000000000000000000000000000000000..8c0501439e9d5fa175f5aa9b62d286e690a10904 --- /dev/null +++ b/benchmark/cluster/vgg16/run_vgg_dist.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Update to point to the source file. +VGG_SRC="vgg16_fluid.py" + +export TRAINING_ROLE=PSERVER +export TRAINERS=2 +export POD_IP=127.0.0.1 +export PADDLE_INIT_PORT=6174 +MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 & + +# Need to wait for the ps to start first. +sleep 10 +echo "done start ps" + +export TRAINING_ROLE=TRAINER +export TRAINERS=2 +export POD_IP=127.0.0.1 +export PADDLE_INIT_PORT=6174 +CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 & +CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 & diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 8b29227cfab2a36d5b9f6d17b837b33da8d2a92e..6c47f6535c3cfc31ba165ccec07ac95a9bfd0a7f 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -200,18 +200,19 @@ def main(): num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" - % (pass_id, iters, loss, acc, - len(data) / (time.time() - ts)) + "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " + "Speed = %.2f img/s " % (args.task_index, pass_id, iters, + loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time pass_train_acc = train_pass_acc.eval() pass_test_acc = test(exe) - print( - "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" - % (pass_id, num_samples / pass_elapsed, pass_train_acc, - pass_test_acc)) + print("Task:%d Pass = %d, Training performance = %f imgs/s, " + "Train accuracy = %f, Test accuracy = %f\n" % + (args.task_index, pass_id, num_samples / pass_elapsed, + pass_train_acc, pass_test_acc)) if args.local: # Parameter initialization