提交 cb34f6a2 编写于 作者: T typhoonzero

update fluid vgg16 and add readme

上级 d3905fbc
...@@ -3,10 +3,13 @@ ...@@ -3,10 +3,13 @@
#ADD reader.py /workspace/ #ADD reader.py /workspace/
#RUN python /workspace/reader.py #RUN python /workspace/reader.py
FROM python:2.7.14 FROM python:2.7.14
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ADD paddle_k8s /usr/bin ADD paddle_k8s /usr/bin
ADD k8s_tools.py /root ADD k8s_tools.py /root
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/ ADD vgg16.py /workspace/
# Fluid distributed training perf test
## Steps to get started
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable verbose logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
\ No newline at end of file
...@@ -61,7 +61,6 @@ start_fluid_process() { ...@@ -61,7 +61,6 @@ start_fluid_process() {
if [ "${TRAINING_ROLE}" == "TRAINER" ]; then if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
check_failed_cnt ${TRAINERS} check_failed_cnt ${TRAINERS}
sleep 5 sleep 5
stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id) export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
fi fi
export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips) export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
......
...@@ -33,7 +33,7 @@ spec: ...@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
......
...@@ -17,7 +17,7 @@ spec: ...@@ -17,7 +17,7 @@ spec:
- name: trainer - name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
imagePullPolicy: Always imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"] command: ["paddle_k8s", "start_fluid"]
env: env:
- name: PADDLE_JOB_NAME - name: PADDLE_JOB_NAME
value: vgg16job value: vgg16job
...@@ -30,7 +30,7 @@ spec: ...@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py" value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
......
...@@ -140,12 +140,14 @@ def main(): ...@@ -140,12 +140,14 @@ def main():
def train_loop(exe, trainer_prog): def train_loop(exe, trainer_prog):
iters = 0 iters = 0
ts = time.time()
for pass_id in range(args.num_passes): for pass_id in range(args.num_passes):
# train # train
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
accuracy.reset(exe) accuracy.reset(exe)
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(map(lambda x: x[0].reshape(data_shape), img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32") data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.array(map(lambda x: x[1], data)).astype("int64")
...@@ -158,8 +160,8 @@ def main(): ...@@ -158,8 +160,8 @@ def main():
iters += 1 iters += 1
num_samples += len(data) num_samples += len(data)
print( print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
(pass_id, iters, loss, acc) (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time pass_elapsed = time.time() - start_time
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册