Merge pull request #10128 from panyx0718/dist0423

Add some instructions for running VGG in distributed mode

Merge pull request #10128 from panyx0718/dist0423
Add some instructions for running VGG in distributed mode
7c90d7a3 · Xin Pan · GitHub · c7e23bb7 · f0917740 · 7c90d7a3
隐藏空白更改
内联并排

Showing 2 changed files with 29 additions and 7 deletions

benchmark/cluster/vgg16/run_vgg_dist.sh benchmark/cluster/vgg16/run_vgg_dist.sh +21 -0

benchmark/cluster/vgg16/vgg16_fluid.py benchmark/cluster/vgg16/vgg16_fluid.py +8 -7

未找到文件。
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ b/benchmark/cluster/vgg16/run_vgg_dist.sh
+#!/bin/bash
+# Path to the VGG16 training script; update it if the script lives elsewhere.
+VGG_SRC="vgg16_fluid.py"
+export TRAINING_ROLE=PSERVER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
+# Wait for the parameter server (ps) to start before launching the trainers.
+sleep 10
+echo "done start ps"
+export TRAINING_ROLE=TRAINER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
+CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -200,18 +200,19 @@ def main():
                num_samples += len(data)
                train_pass_acc.add(value=acc, weight=b_size)
                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    % (pass_id, iters, loss, acc,
+                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                       len(data) / (time.time() - ts))
+                                             loss, acc,
+                                             len(data) / (time.time() - ts))
                )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
-            print(
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                  "Train accuracy = %f, Test accuracy = %f\n" %
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
-                   pass_test_acc))
+                   pass_train_acc, pass_test_acc))
    if args.local:
        # Parameter initialization