diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/fluid/Dockerfile index 77cd17f2b912b22eb7371c93e0e46e033a8b78f8..711076b09e316292007acc40bedc1987d06c0065 100644 --- a/benchmark/cluster/vgg16/fluid/Dockerfile +++ b/benchmark/cluster/vgg16/fluid/Dockerfile @@ -3,10 +3,13 @@ #ADD reader.py /workspace/ #RUN python /workspace/reader.py FROM python:2.7.14 -ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl ADD paddle_k8s /usr/bin ADD k8s_tools.py /root RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD reader.py /workspace/ +RUN python /workspace/reader.py ADD vgg16.py /workspace/ diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md new file mode 100644 index 0000000000000000000000000000000000000000..63a460f7a6200ceb4731f409c3745fd8208ce054 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid/README.md @@ -0,0 +1,15 @@ +# Fluid distributed training perf test + +## Steps to get started + +1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory. +1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. + +Check the logs for the distributed training progress and analyze the performance. 
+ +## Enable verbose logs + +Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail. \ No newline at end of file diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/fluid/paddle_k8s index 8f1c5db717e6fdf4869c2f9f20cf98d11b19052c..af5f35b3eca75f84e1e17ac5701964ea5e02f224 100755 --- a/benchmark/cluster/vgg16/fluid/paddle_k8s +++ b/benchmark/cluster/vgg16/fluid/paddle_k8s @@ -61,7 +61,6 @@ start_fluid_process() { if [ "${TRAINING_ROLE}" == "TRAINER" ]; then check_failed_cnt ${TRAINERS} sleep 5 - stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1 export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id) fi export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips) diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid/pserver.yaml index 47d2380d2eaa0008d2edbcc85f1d76b126c356fd..e1a58260af0325a313934cfa3730801190cadcce 100644 --- a/benchmark/cluster/vgg16/fluid/pserver.yaml +++ b/benchmark/cluster/vgg16/fluid/pserver.yaml @@ -33,7 +33,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" + value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid/trainer.yaml index bada190764aa238549fb85e8b27031d4a4f98c61..c8e26d4b511f4f659fc08229cb463bd77a6f724b 100644 --- a/benchmark/cluster/vgg16/fluid/trainer.yaml +++ b/benchmark/cluster/vgg16/fluid/trainer.yaml @@ -17,7 +17,7 @@ spec: - name: trainer image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" imagePullPolicy: Always - command: ["paddle_k8s", "start_trainer", "v2"] + command: ["paddle_k8s", "start_fluid"] env: - name: PADDLE_JOB_NAME value: vgg16job @@ -30,7 +30,7 @@ spec: 
- name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py" + value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py index 0595a28784b9c6fbcd81a2362ab1cfc05408d192..a973f9d2a697c8658d401728137c1de28c30f3b5 100644 --- a/benchmark/cluster/vgg16/fluid/vgg16.py +++ b/benchmark/cluster/vgg16/fluid/vgg16.py @@ -140,12 +140,14 @@ def main(): def train_loop(exe, trainer_prog): iters = 0 + ts = time.time() for pass_id in range(args.num_passes): # train start_time = time.time() num_samples = 0 accuracy.reset(exe) for batch_id, data in enumerate(train_reader()): + ts = time.time() img_data = np.array(map(lambda x: x[0].reshape(data_shape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") @@ -158,8 +160,8 @@ def main(): iters += 1 num_samples += len(data) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss, acc) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" % + (pass_id, iters, loss, acc, time.time() - ts) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time