From ed365919b409749d903a2a5b4fbc4d3b00bb6f7c Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Fri, 1 Jun 2018 14:46:19 +0800
Subject: [PATCH] Add fluid benchmark Dockerfile (#11095)

* add fluid benchmark Dockerfile

* add_fluid_benchmark_dockerfile
---
 benchmark/fluid/Dockerfile | 22 ++++++++++++++++++++++
 benchmark/fluid/README.md  | 16 +++++++++++++++-
 benchmark/fluid/run.sh     | 26 ++++++++++++++------------
 3 files changed, 51 insertions(+), 13 deletions(-)
 create mode 100644 benchmark/fluid/Dockerfile

diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 00000000000..46140a9d1be
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop && rm -rf /var/lib/apt/lists/*
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install --no-cache-dir -U pip
+RUN pip install --no-cache-dir -U kubernetes opencv-python paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# The released paddlepaddle package is only used to fetch the datasets below at build time; it is uninstalled afterwards.
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin/
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root/
+# Install the paddle wheel found in the build context, then make the downloaded paddle_k8s script executable.
+COPY *.whl /
+RUN pip install --no-cache-dir /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+# Copy models/ as a directory: a "models/" source only copies its contents (assumes the script loads from models/ - confirm).
+COPY fluid_benchmark.py dataset.py /workspace/
+COPY models/ /workspace/models/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 7071e9fdcd3..1b0c7dce8bd 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -44,11 +44,25 @@ Currently supported `--model` argument include:
 
 ## Run Distributed Benchmark on Kubernetes Cluster
 
+You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you
+will have to start all those processes manually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it yourself. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 
 ```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 
 Then the yaml files are generated under directory `myjob`, you can run:
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2e..afaab5f4de4 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
       -l 1 &
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=mnist \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
@@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
 
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
@@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
                2>&1 | tee -a vgg16_gpu_128.log
 
 # flowers gpu  128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=32 \
                --data_set=flowers \
@@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
 
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet50 \
                --device=GPU \
                --batch_size=128 \
                --data_set=cifar10 \
-               --model=resnet_cifar10 \
                --skip_batch_num=5 \
                --iterations=30 \
                2>&1 | tee -a resnet50_gpu_128.log
 
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet50 \
                --device=GPU \
                --batch_size=64 \
                --data_set=flowers \
-               --model=resnet_imagenet \
                --skip_batch_num=5 \
                --iterations=30 \
                2>&1 | tee -a resnet50_gpu_flowers_64.log
 
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=stacked_dynamic_lstm \
                --device=GPU \
                --batch_size=32 \
                --skip_batch_num=5 \
                --iterations=30 \
-               --hidden_dim=512 \
-               --emb_dim=512 \
-               --crop_size=1500 \
                2>&1 | tee -a lstm_gpu_32.log
 
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=machine_translation \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
-- 
GitLab