From cb34f6a230bf51cc6cb0b8b2ef93b3e13ed3f516 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 22 Jan 2018 14:45:06 +0800 Subject: [PATCH] update fluid vgg16 and add readme --- benchmark/cluster/vgg16/fluid/Dockerfile | 7 +++++-- benchmark/cluster/vgg16/fluid/README.md | 15 +++++++++++++++ benchmark/cluster/vgg16/fluid/paddle_k8s | 1 - benchmark/cluster/vgg16/fluid/pserver.yaml | 2 +- benchmark/cluster/vgg16/fluid/trainer.yaml | 4 ++-- benchmark/cluster/vgg16/fluid/vgg16.py | 6 ++++-- 6 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 benchmark/cluster/vgg16/fluid/README.md diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/fluid/Dockerfile index 77cd17f2b91..711076b09e3 100644 --- a/benchmark/cluster/vgg16/fluid/Dockerfile +++ b/benchmark/cluster/vgg16/fluid/Dockerfile @@ -3,10 +3,13 @@ #ADD reader.py /workspace/ #RUN python /workspace/reader.py FROM python:2.7.14 -ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl ADD paddle_k8s /usr/bin ADD k8s_tools.py /root RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD reader.py /workspace/ +RUN python /workspace/reader.py ADD vgg16.py /workspace/ diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md new file mode 100644 index 00000000000..63a460f7a62 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid/README.md @@ -0,0 +1,15 @@ +# Fluid distributed training perf test + +## Steps to get started + +1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. +1. 
Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to repository so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. + +Check the logs for the distributed training progress and analyze the performance. + +## Enable verbose logs + +Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail. \ No newline at end of file diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/fluid/paddle_k8s index 8f1c5db717e..af5f35b3eca 100755 --- a/benchmark/cluster/vgg16/fluid/paddle_k8s +++ b/benchmark/cluster/vgg16/fluid/paddle_k8s @@ -61,7 +61,6 @@ start_fluid_process() { if [ "${TRAINING_ROLE}" == "TRAINER" ]; then check_failed_cnt ${TRAINERS} sleep 5 - stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1 export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id) fi export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips) diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid/pserver.yaml index 47d2380d2ea..e1a58260af0 100644 --- a/benchmark/cluster/vgg16/fluid/pserver.yaml +++ b/benchmark/cluster/vgg16/fluid/pserver.yaml @@ -33,7 +33,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" + value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid/trainer.yaml index bada190764a..c8e26d4b511 100644 ---
a/benchmark/cluster/vgg16/fluid/trainer.yaml +++ b/benchmark/cluster/vgg16/fluid/trainer.yaml @@ -17,7 +17,7 @@ spec: - name: trainer image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" imagePullPolicy: Always - command: ["paddle_k8s", "start_trainer", "v2"] + command: ["paddle_k8s", "start_fluid"] env: - name: PADDLE_JOB_NAME value: vgg16job @@ -30,7 +30,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py" + value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py index 0595a28784b..a973f9d2a69 100644 --- a/benchmark/cluster/vgg16/fluid/vgg16.py +++ b/benchmark/cluster/vgg16/fluid/vgg16.py @@ -140,12 +140,14 @@ def main(): def train_loop(exe, trainer_prog): iters = 0 + ts = time.time() for pass_id in range(args.num_passes): # train start_time = time.time() num_samples = 0 accuracy.reset(exe) for batch_id, data in enumerate(train_reader()): + ts = time.time() img_data = np.array(map(lambda x: x[0].reshape(data_shape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") @@ -158,8 +160,8 @@ def main(): iters += 1 num_samples += len(data) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss, acc) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" % + (pass_id, iters, loss, acc, time.time() - ts) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time -- GitLab