From ed365919b409749d903a2a5b4fbc4d3b00bb6f7c Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Fri, 1 Jun 2018 14:46:19 +0800 Subject: [PATCH] Add fluid benchmark Dockerfile (#11095) * add fluid benchmark Dockerfile * add_fluid_benchmark_dockerfile --- benchmark/fluid/Dockerfile | 22 ++++++++++++++++++++++ benchmark/fluid/README.md | 16 +++++++++++++++- benchmark/fluid/run.sh | 26 ++++++++++++++------------ 3 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 benchmark/fluid/Dockerfile diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile new file mode 100644 index 0000000000..46140a9d1b --- /dev/null +++ b/benchmark/fluid/Dockerfile @@ -0,0 +1,22 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 +RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop +RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so +RUN pip install -U pip +RUN pip install -U kubernetes opencv-python paddlepaddle + +# IMPORTANT: +# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. 
+ +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python' +RUN pip uninstall -y paddlepaddle && mkdir /workspace + +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root + +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s + +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD fluid_benchmark.py dataset.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 7071e9fdcd..1b0c7dce8b 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -44,11 +44,25 @@ Currently supported `--model` argument include: ## Run Distributed Benchmark on Kubernetes Cluster +You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will +have to start all those processes manually on each node, which is not recommended. + +To build the Docker image, you need to choose a paddle "whl" package to run with, you may either +download it from +http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or +build it on your own. Once you've got the "whl" package, put it under the current directory and run: + +```bash +docker build -t [your docker image name]:[your docker image tag] . +``` + +Then push the image to a Docker registry that your Kubernetes cluster can reach. + We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit distributed benchmark jobs to your cluster. 
To generate a job yaml, just run: ```bash -python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver +python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver ``` Then the yaml files are generated under directory `myjob`, you can run: diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index f6dfd20bf2..afaab5f4de 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \ -l 1 & # mnist # mnist gpu mnist 128 -FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=mnist \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ @@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ # vgg16 # gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=vgg16 \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ @@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ 2>&1 | tee -a vgg16_gpu_128.log # flowers gpu 128 -FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=vgg16 \ --device=GPU \ --batch_size=32 \ --data_set=flowers \ @@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ # resnet50 # resnet50 gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=resnet50 \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ - --model=resnet_cifar10 \ --skip_batch_num=5 \ 
--iterations=30 \ 2>&1 | tee -a resnet50_gpu_128.log # resnet50 gpu flowers 64 -FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=resnet50 \ --device=GPU \ --batch_size=64 \ --data_set=flowers \ - --model=resnet_imagenet \ --skip_batch_num=5 \ --iterations=30 \ 2>&1 | tee -a resnet50_gpu_flowers_64.log # lstm # lstm gpu imdb 32 # tensorflow only support batch=32 -FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=stacked_dynamic_lstm \ --device=GPU \ --batch_size=32 \ --skip_batch_num=5 \ --iterations=30 \ - --hidden_dim=512 \ - --emb_dim=512 \ - --crop_size=1500 \ 2>&1 | tee -a lstm_gpu_32.log # seq2seq # seq2seq gpu wmb 128 -FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=machine_translation \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ -- GitLab