diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md deleted file mode 100644 index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000 --- a/benchmark/cluster/README.md +++ /dev/null @@ -1,196 +0,0 @@ -# Cluster Training Benchmark - -## Setup - -- Platform - - Kubernetes: v1.6.2 - - Linux Kernel: v3.10.0 - -- Resource - - CPU: 10 Cores per Pod - - Memory: 5GB per Pod - -- Docker Image - - We use different base Docker Image to run the benchmark on Kubernetes: - - PaddlePaddle v2: paddlepaddle/paddle:0.11.0 - - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id] - - TensorFlow: tensorflow/tensorflow:1.5.0-rc0 - -- Model - vgg16 is used in this benchmark. - -## Cases - -- Variable - - Batch Size of training data. - - PServer count of the training job. - - The number of trainers. - -- Invariant - - The resource of trainer/pserver Pod. - -### Measure the Performance for Different Batch Size - -- PServer Count: 40 -- Trainer Count: 100 -- Metrics: mini-batch / sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Batch Size | 32 | 64 | 128 | 256 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
- -### Measure the Performance for Different PServer Count - -- Trainer Count: 100 -- Batch Size: 64 -- Metrics: mini-batch / sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| PServer Count | 10 | 20 | 40 | 60 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
- -### Measure Parallel Efficiency By Increasing Trainer Count - -- PServer Count: 20 -- Batch Size: 64 -- Metrics: - -$S = \frac{T_1}{T_N}$ - -where $S$ is the speedup: the ratio of $T_1$ to $T_N$, the training times with 1 and N trainers. -The parallel efficiency is: - -$E = \frac{S}{N}$ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
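| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |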
- - -## Reproduce the benchmark - -TODO diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile deleted file mode 100644 index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 - -# you can get mirror list here: -# https://launchpad.net/ubuntu/+archivemirrors -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev -RUN pip install -U kubernetes opencv-python - -RUN pip install paddlepaddle -# if network is slowly, you may need to add proxy here. -# ENV https_proxy= -RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' -RUN pip uninstall -y paddlepaddle -# unset proxy if it is setted. -# ENV https_proxy="" - -# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, -# so we must build one with distribute support to install in this image. 
-ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl -ENV LD_LIBRARY_PATH=/usr/local/lib - -# tf k8s -RUN pip install tensorflow==1.4.0 -ADD tf_k8s /usr/bin -RUN chmod +x /usr/bin/tf_k8s -ADD vgg16_tf.py /workspace/ - -# below lines may change a lot for debugging -ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin -ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root -RUN chmod +x /usr/bin/paddle_k8s -ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md deleted file mode 100644 index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/README.md +++ /dev/null @@ -1,195 +0,0 @@ -# Performance for Distributed vgg16 - -## Test Result - -### Hardware Infomation - -- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz -- cpu MHz : 2101.000 -- cache size : 20480 KB - -### Blas settings - -Setting environment variable: `MKL_NUM_THREADS=1`. - -### Single Node Single Thread - -- Metrics: samples / sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Batch Size | 32 | 64 | 128 | 256 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
- - -### Different Batch Size - -- PServer Count: 10 -- Trainer Count: 20 -- Metrics: samples / sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Batch Size | 32 | 64 | 128 | 256 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
- -### Accelerate Rate - -- Pserver Count: 20 -- Batch Size: 128 -- Metrics: samples / sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Trainer Count | 20 | 40 | 80 | 100 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
- - -### Different Pserver Count - -- Trainer Count: 60 -- Batch Size: 128 -- Metrics: samples/ sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| PServer Count | 3 | 6 | 10 | 20 |
| --- | --- | --- | --- | --- |
| PaddlePaddle Fluid (should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 (need more tests) | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
- - -*The performance gap between Fluid and v2 comes from the network interference.* - - -## Steps to Run the Performance Test - -1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. -1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory. -1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it. -1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). -1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. - -Check the logs for the distributed training progress and analyze the performance. - -## Enable Verbose Logs - -Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail. 
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml deleted file mode 100644 index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/fluid_pserver.yaml +++ /dev/null @@ -1,72 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: ReplicaSet -metadata: - name: vgg16job-pserver -spec: - replicas: 10 - template: - metadata: - labels: - paddle-job-pserver: vgg16job - spec: - hostNetwork: true - imagePullSecrets: - - name: job-registry-secret - containers: - - name: pserver - image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" - imagePullPolicy: Always - ports: - - name: jobport-30236 - containerPort: 30236 - env: - - name: PADDLE_JOB_NAME - value: vgg16job - - name: MKL_NUM_THREADS - value: "1" - - name: TRAINING_ROLE - value: "PSERVER" - - name: TRAINERS - value: "20" - - name: PSERVERS - value: "10" - - name: TOPOLOGY - value: "" - - name: ENTRY - value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: PADDLE_INIT_PORT - value: "30236" - - name: PADDLE_INIT_NICS - value: "xgbe0" - - name: PADDLE_INIT_TRAINER_COUNT - value: "1" - - name: PADDLE_INIT_PORTS_NUM - value: "1" - - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE - value: "1" - - name: PADDLE_INIT_NUM_GRADIENT_SERVERS - value: "20" - - name: PADDLE_INIT_NUM_PASSES - value: "1" - - name: PADDLE_INIT_USE_GPU - value: "0" - - name: LD_LIBRARY_PATH - value: "/usr/local/lib:/usr/local/nvidia/lib64" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: "status.podIP" - command: ["paddle_k8s", "start_fluid"] - resources: - requests: - memory: 10Gi - cpu: 4 - limits: - memory: 10Gi - cpu: 4 diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml deleted file mode 100644 index 
3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/fluid_trainer.yaml +++ /dev/null @@ -1,69 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: vgg16job-trainer -spec: - parallelism: 20 - completions: 20 - template: - metadata: - labels: - paddle-job: vgg16job - spec: - imagePullSecrets: - - name: job-registry-secret - hostNetwork: true - containers: - - name: trainer - image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" - imagePullPolicy: Always - command: ["paddle_k8s", "start_fluid"] - env: - - name: PADDLE_JOB_NAME - value: vgg16job - - name: TRAINING_ROLE - value: "TRAINER" - - name: TRAINERS - value: "20" - - name: PSERVERS - value: "10" - - name: TOPOLOGY - value: "" - - name: ENTRY - value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: PADDLE_INIT_PORT - value: "30236" - - name: PADDLE_INIT_NICS - value: "xgbe0" - - name: PADDLE_INIT_TRAINER_COUNT - value: "1" - - name: PADDLE_INIT_PORTS_NUM - value: "1" - - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE - value: "1" - - name: PADDLE_INIT_NUM_GRADIENT_SERVERS - value: "20" - - name: PADDLE_INIT_NUM_PASSES - value: "1" - - name: PADDLE_INIT_USE_GPU - value: "0" - - name: LD_LIBRARY_PATH - value: "/usr/local/lib:/usr/local/nvidia/lib64" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: "status.podIP" - resources: - requests: - memory: 40Gi - cpu: 2 - limits: - memory: 40Gi - cpu: 2 - restartPolicy: Never diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh deleted file mode 100644 index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/run_vgg_dist.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# Update to point to the source file. 
-VGG_SRC="vgg16_fluid.py" - -export TRAINING_ROLE=PSERVER -export TRAINERS=2 -export POD_IP=127.0.0.1 -export PADDLE_INIT_PORT=6174 -MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 & - -# Need to wait for the ps to start first. -sleep 10 -echo "done start ps" - -export TRAINING_ROLE=TRAINER -export TRAINERS=2 -export POD_IP=127.0.0.1 -export PADDLE_INIT_PORT=6174 -CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 & -CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 & diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s deleted file mode 100644 index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/tf_k8s +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash -check_trainer_ret() { - ret=$1 - stdbuf -oL echo "job returned $ret...setting pod return message..." - stdbuf -oL echo "===============================" - - if [ $ret -eq 136 ] ; then - echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log - elif [ $ret -eq 139 ] ; then - echo "Segmentation Fault" > /dev/termination-log - elif [ $ret -eq 1 ] ; then - echo "General Error" > /dev/termination-log - elif [ $ret -eq 134 ] ; then - echo "Program Abort" > /dev/termination-log - fi - stdbuf -oL echo "termination log wroted..." 
- exit $ret -} - -g_pservers="" -g_trainers="" - -wait_running_pods(){ - pserver_label="tf-job-pserver=${JOB_NAME}" - trainer_label="tf-job-trainer=${JOB_NAME}" - - stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM} - stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM} - - g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT}) - g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT}) -} - -start_tf_pserver(){ - wait_running_pods - - label="tf-job-pserver=${JOB_NAME}" - pserver_id=$(python /root/k8s_tools.py fetch_id ${label}) - - cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ - --job_name=${TF_JOB_NAME} --task_index=${pserver_id}" - - stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" -} - -start_tf_trainer(){ - wait_running_pods - - label="tf-job-trainer=${JOB_NAME}" - trainer_id=$(python /root/k8s_tools.py fetch_id ${label}) - - cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ - --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}" - - stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" - check_trainer_ret $? 
-} - -start_tf(){ - if [[ "${TF_JOB_NAME}" == "worker" ]]; then - start_tf_trainer - else - start_tf_pserver - fi -} - -usage() { - echo "usage: tf_k8s []:" - echo " start_tf Start tensorflow jobs" -} - -case "$1" in - start_tf) - start_tf - ;; - --help) - usage - ;; - *) - usage - ;; -esac diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml deleted file mode 100644 index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/tf_pserver.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: ReplicaSet -metadata: - name: vgg16job-tf-pserver -spec: - replicas: 10 - template: - metadata: - labels: - tf-job-pserver: vgg16job-tf - spec: - hostNetwork: true - imagePullSecrets: - - name: job-registry-secret - containers: - - name: pserver - image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" - imagePullPolicy: Always - command: ["tf_k8s", "start_tf"] - ports: - - name: jobport-30236 - containerPort: 30236 - env: - - name: PORT - value: "32036" - - name: ENTRY - value: "python vgg16_tf.py" - - name: JOB_NAME - value: vgg16job-tf - - name: PSERVERS_NUM - value: "10" - - name: TF_JOB_NAME - value: "ps" - - name: TRAINERS_NUM - value: "20" - - name: BATCH_SIZE - value: "128" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: NUM_PASSES - value: "1" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: "status.podIP" - resources: - requests: - memory: 10Gi - cpu: 4 - limits: - memory: 10Gi - cpu: 4 diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml deleted file mode 100644 index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/tf_trainer.yaml +++ /dev/null @@ -1,58 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: vgg16job-tf-trainer -spec: - 
parallelism: 20 - completions: 20 - template: - metadata: - labels: - tf-job-trainer: vgg16job-tf - spec: - imagePullSecrets: - - name: job-registry-secret - hostNetwork: true - containers: - - name: trainer - image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" - imagePullPolicy: Always - command: ["tf_k8s", "start_tf"] - ports: - - name: jobport-30236 - containerPort: 30236 - env: - - name: PORT - value: "32036" - - name: JOB_NAME - value: vgg16job-tf - - name: TF_JOB_NAME - value: "worker" - - name: ENTRY - value: "python vgg16_tf.py" - - name: PSERVERS_NUM - value: "10" - - name: BATCH_SIZE - value: "128" - - name: TRAINERS_NUM - value: "20" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: NUM_PASSES - value: "1" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: "status.podIP" - resources: - requests: - memory: 40Gi - cpu: 2 - limits: - memory: 40Gi - cpu: 2 - restartPolicy: Never diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml deleted file mode 100644 index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/v2_pserver.yaml +++ /dev/null @@ -1,64 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: ReplicaSet -metadata: - name: vgg16v2job-pserver -spec: - replicas: 10 - template: - metadata: - labels: - paddle-job-pserver: vgg16v2job - spec: - hostNetwork: true - imagePullSecrets: - - name: job-registry-secret - containers: - - name: pserver - image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" - imagePullPolicy: Always - ports: - - name: jobport-30236 - containerPort: 30236 - env: - - name: PADDLE_JOB_NAME - value: vgg16v2job - - name: TRAINERS - value: "20" - - name: PSERVERS - value: "10" - - name: TOPOLOGY - value: "" - - name: ENTRY - value: "python train.py" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: PADDLE_INIT_PORT - 
value: "30236" - - name: PADDLE_INIT_NICS - value: "xgbe0" - - name: PADDLE_INIT_TRAINER_COUNT - value: "1" - - name: PADDLE_INIT_PORTS_NUM - value: "1" - - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE - value: "1" - - name: PADDLE_INIT_NUM_GRADIENT_SERVERS - value: "20" - - name: PADDLE_INIT_NUM_PASSES - value: "1" - - name: PADDLE_INIT_USE_GPU - value: "0" - - name: LD_LIBRARY_PATH - value: "/usr/local/lib:/usr/local/nvidia/lib64" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - command: ["paddle_k8s", "start_pserver"] - resources: - requests: - memory: 10Gi - cpu: 4 - limits: - memory: 10Gi - cpu: 4 diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml deleted file mode 100644 index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/v2_trainer.yaml +++ /dev/null @@ -1,65 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: vgg16v2job-trainer -spec: - parallelism: 20 - completions: 20 - template: - metadata: - labels: - paddle-job: vgg16v2job - spec: - imagePullSecrets: - - name: job-registry-secret - hostNetwork: true - containers: - - name: trainer - image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" - imagePullPolicy: Always - command: ["paddle_k8s", "start_trainer", "v2"] - env: - - name: PADDLE_JOB_NAME - value: vgg16v2job - - name: BATCH_SIZE - value: "256" - - name: TRAINERS - value: "20" - - name: PSERVERS - value: "10" - - name: TOPOLOGY - value: "" - - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" - - name: TRAINER_PACKAGE - value: "/workspace" - - name: PADDLE_INIT_PORT - value: "30236" - - name: PADDLE_INIT_NICS - value: "xgbe0" - - name: PADDLE_INIT_TRAINER_COUNT - value: "1" - - name: PADDLE_INIT_PORTS_NUM - value: "1" - - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE - value: "1" - - name: PADDLE_INIT_NUM_GRADIENT_SERVERS - value: "20" - - name: PADDLE_INIT_NUM_PASSES 
- value: "2" - - name: PADDLE_INIT_USE_GPU - value: "0" - - name: LD_LIBRARY_PATH - value: "/usr/local/lib:/usr/local/nvidia/lib64" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: "metadata.namespace" - resources: - requests: - memory: 40Gi - cpu: 2 - limits: - memory: 40Gi - cpu: 2 - restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py deleted file mode 100644 index e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""VGG16 benchmark in Fluid""" -from __future__ import print_function - -import sys -import time -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.profiler as profiler -import argparse -import functools -import os -from paddle.fluid import debuger - - -def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--batch_size', type=int, default=16, help="Batch size for training.") -parser.add_argument( - '--learning_rate', - type=float, - default=1e-3, - help="Learning rate for training.") -parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") -parser.add_argument( - '--device', - type=str, - default='CPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument('--device_id', type=int, default=0, help="The device id.") -parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data order, now only support NCHW.') -parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') -parser.add_argument( - '--local', - type=str2bool, - default=True, - help='Whether to run as local mode.') - -parser.add_argument( - "--ps_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs") -parser.add_argument( - "--trainer_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs") -parser.add_argument( - "--profile", action='store_true', help="If set, profile a few steps.") - -# Flags for defining the tf.train.Server -parser.add_argument( - "--task_index", type=int, default=0, help="Index of task within the job") -args = 
parser.parse_args() - - -def vgg16_bn_drop(input): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=4096, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) - return fc2 - - -def main(): - if args.data_set == "cifar10": - classdim = 10 - if args.data_format == 'NCHW': - data_shape = [3, 32, 32] - else: - data_shape = [32, 32, 3] - else: - classdim = 102 - if args.data_format == 'NCHW': - data_shape = [3, 224, 224] - else: - data_shape = [224, 224, 3] - - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - net = vgg16_bn_drop(images) - predict = fluid.layers.fc(input=net, size=classdim, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size) - - # inference program - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program(batch_acc) - - # Optimization - optimizer = 
fluid.optimizer.Adam(learning_rate=args.learning_rate) - optimize_ops, params_grads = optimizer.minimize(avg_cost) - - # Initialize executor - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace( - args.device_id) - exe = fluid.Executor(place) - - # test - def test(exe): - test_pass_acc = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - outs = exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size]) - test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1])) - - return test_pass_acc.eval() - - def train_loop(exe, trainer_prog): - iters = 0 - ts = time.time() - train_pass_acc = fluid.average.WeightedAverage() - for pass_id in range(args.num_passes): - # train - start_time = time.time() - num_samples = 0 - train_pass_acc.reset() - - def run_step(batch_id, data): - img_data = np.array( - map(lambda x: x[0].reshape(data_shape), data)).astype( - "float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - loss, acc, b_size = exe.run( - trainer_prog, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[avg_cost, batch_acc, batch_size]) - return loss, acc, b_size - - if args.profile: - with profiler.profiler('All', 'total', - '/tmp/profile_vgg_%d' % args.task_index): - for batch_id, data in enumerate(train_reader()): - if batch_id > 5: break - run_step(batch_id, data) - - total_time = 0.0 - count = 0 - for batch_id, data in enumerate(train_reader()): - ts = time.time() - loss, acc, b_size = run_step(batch_id, data) - iters += 1 - num_samples += len(data) - train_pass_acc.add(value=acc, weight=b_size) - - duration = time.time() - ts - total_time += duration - count += len(data) - print( - "Pass = %d, Iters = %d, 
Loss = %f, Accuracy = %f, " - "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc, - len(data) / duration, - count / total_time) - ) # The accuracy is the accumulation of batches, but not the current batch. - - pass_elapsed = time.time() - start_time - pass_train_acc = train_pass_acc.eval() - pass_test_acc = test(exe) - print("Task:%d Pass = %d, Training performance = %f imgs/s, " - "Train accuracy = %f, Test accuracy = %f\n" % - (args.task_index, pass_id, num_samples / pass_elapsed, - pass_train_acc, pass_test_acc)) - - if args.local: - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() if args.data_set == 'cifar10' - else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - train_loop(exe, fluid.default_main_program()) - else: - trainers = int(os.getenv("TRAINERS")) # total trainer count - print("trainers total: ", trainers) - - training_role = os.getenv( - "TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver - - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=args.task_index, - pservers=args.ps_hosts, - trainers=trainers) - - if training_role == "PSERVER": - current_endpoint = os.getenv("POD_IP") + ":" + os.getenv( - "PADDLE_INIT_PORT") - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, - pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() if args.data_set == 
'cifar10' - else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else - paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver - exe.run(fluid.default_startup_program()) - train_loop(exe, trainer_prog) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -def print_arguments(): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == "__main__": - print_arguments() - main() diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py deleted file mode 100644 index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/vgg16_tf.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""VGG16 benchmark in TensorFlow -You can get distribution example template structure here: -https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb -https://www.tensorflow.org/deploy/distributed -""" - -import tensorflow as tf -import paddle.v2 as paddle -import numpy as np -import argparse -import time - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--batch_size', type=int, default=128, help="Batch size for training.") -parser.add_argument( - '--learning_rate', - type=float, - default=1e-3, - help="Learning rate for training.") -parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") -parser.add_argument( - '--device', - type=str, - default='CPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - '--data_format', - type=str, - default='NHWC', - choices=['NCHW', 'NHWC'], - help='The data order, NCHW=[batch, channels, height, width].' - 'Only support NHWC right now.') -parser.add_argument( - '--data_set', - type=str, - default='cifar10', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - -parser.add_argument( - "--ps_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs") -parser.add_argument( - "--worker_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs") -parser.add_argument( - "--job_name", type=str, default="", help="One of 'worker', 'ps'") -# Flags for defining the tf.train.Server -parser.add_argument( - "--task_index", type=int, default=0, help="Index of task within the job") - -args = parser.parse_args() - - -class VGG16Model(object): - def __init__(self): - self.parameters = [] - - def batch_norm_relu(self, inputs, is_training): - """Performs a batch normalization followed by a ReLU.""" - # We set fused=True for a significant speed boost. 
See - # https://www.tensorflow.org/speed/speed_guide#common_fused_ops - inputs = tf.layers.batch_normalization( - inputs=inputs, - axis=1 if args.data_format == 'NCHW' else -1, - momentum=0.9, - epsilon=1e-05, - center=True, - scale=True, - training=is_training, - fused=True) - inputs = tf.nn.relu(inputs) - return inputs - - def conv_bn_layer(self, - name, - images, - kernel_shape, - is_training, - drop_rate=0.0): - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - kernel_shape, dtype=tf.float32, stddev=1e-1), - name='weights') - conv = tf.nn.conv2d( - images, - kernel, [1, 1, 1, 1], - data_format=args.data_format, - padding='SAME') - biases = tf.Variable( - tf.constant( - 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), - trainable=True, - name='biases') - out = tf.nn.bias_add(conv, biases) - out = self.batch_norm_relu(out, is_training) - out = tf.layers.dropout(out, rate=drop_rate, training=is_training) - return out - - def fc_layer(self, name, inputs, shape): - with tf.name_scope(name) as scope: - fc_w = tf.Variable( - tf.truncated_normal( - shape, dtype=tf.float32, stddev=1e-1), - name='weights') - fc_b = tf.Variable( - tf.constant( - 0.0, shape=[shape[-1]], dtype=tf.float32), - trainable=True, - name='biases') - out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) - return out - - def network(self, images, class_dim, is_training): - """ VGG16 model structure. 
- - TODO(kuke): enable this network to support the 'NCHW' data format - """ - - # conv1 - conv1_1 = self.conv_bn_layer( - 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) - conv1_2 = self.conv_bn_layer( - 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) - # pool1 - pool1 = tf.nn.max_pool( - conv1_2, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool1') - # conv2 - conv2_1 = self.conv_bn_layer( - 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) - conv2_2 = self.conv_bn_layer( - 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) - # pool2 - pool2 = tf.nn.max_pool( - conv2_2, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool2') - # conv3 - conv3_1 = self.conv_bn_layer( - 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) - conv3_2 = self.conv_bn_layer( - 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) - conv3_3 = self.conv_bn_layer( - 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) - # pool3 - pool3 = tf.nn.max_pool( - conv3_3, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool3') - # conv4 - conv4_1 = self.conv_bn_layer( - 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) - conv4_2 = self.conv_bn_layer( - 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv4_3 = self.conv_bn_layer( - 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) - # pool4 - pool4 = tf.nn.max_pool( - conv4_3, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool4') - # conv5 - conv5_1 = self.conv_bn_layer( - 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv5_2 = self.conv_bn_layer( - 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv5_3 = self.conv_bn_layer( - 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) - # pool5 - pool5 = tf.nn.max_pool( - conv5_3, - ksize=[1, 2, 2, 1], - 
strides=[1, 2, 2, 1], - padding='SAME', - name='pool4') - # flatten - shape = int(np.prod(pool5.get_shape()[1:])) - pool5_flat = tf.reshape(pool5, [-1, shape]) - # fc1 - drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) - fc1 = self.fc_layer('fc1', drop, [shape, 512]) - # fc2 - bn = self.batch_norm_relu(fc1, is_training) - drop = tf.layers.dropout(bn, rate=0.5, training=is_training) - fc2 = self.fc_layer('fc2', drop, [512, 512]) - - fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) - - return fc3 - - -def run_benchmark(cluster_spec, server): - """Run benchmark on cifar10 or flowers.""" - - if args.data_set == "cifar10": - class_dim = 10 - raw_shape = (3, 32, 32) - dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( - None, 3, 32, 32) - else: - class_dim = 102 - raw_shape = (3, 224, 224) - dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( - None, 3, 224, 224) - - device = tf.train.replica_device_setter( - worker_device="/job:worker/task:{}".format(args.task_index), - cluster=cluster_spec) - - with tf.device(device): - images = tf.placeholder(tf.float32, shape=dat_shape) - labels = tf.placeholder(tf.int64, shape=(None, )) - is_training = tf.placeholder('bool') - onehot_labels = tf.one_hot(labels, depth=class_dim) - - vgg16 = VGG16Model() - logits = vgg16.network(images, class_dim, is_training) - loss = tf.losses.softmax_cross_entropy( - onehot_labels=onehot_labels, logits=logits) - avg_loss = tf.reduce_mean(loss) - - correct = tf.equal(tf.argmax(logits, 1), labels) - accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) - - optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - global_step = tf.Variable(0, name='global_step', trainable=False) - with tf.control_dependencies(update_ops): - train_op = optimizer.minimize(avg_loss, global_step=global_step) - - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - 
# data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - buf_size=5120), - batch_size=args.batch_size) - - # test - def test(): - test_accs = [] - for batch_id, data in enumerate(test_reader()): - test_images = np.array( - map(lambda x: np.transpose(x[0].reshape(raw_shape), - axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") - test_labels = np.array(map(lambda x: x[1], data)).astype('int64') - test_accs.append( - accuracy.eval(feed_dict={ - images: test_images, - labels: test_labels, - is_training: False - })) - return np.mean(test_accs) - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, - inter_op_parallelism_threads=1, - log_device_placement=True) - config.gpu_options.allow_growth = True - - hooks = [tf.train.StopAtStepHook(last_step=1000000)] - - with tf.train.MonitoredTrainingSession( - master=server.target, - is_chief=(args.task_index == 0), - hooks=hooks, - config=config) as sess: - iters, num_samples, start_time = 0, 0, 0.0 - for pass_id in range(args.num_passes): - # train - num_samples = 0 - start_time = time.time() - for batch_id, data in enumerate(train_reader()): - train_images = np.array( - map(lambda x: np.transpose(x[0].reshape(raw_shape), - axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") - train_labels = np.array(map(lambda x: x[1], data)).astype( - 'int64') - iter_begin_time = time.time() - _, loss, acc = sess.run([train_op, avg_loss, accuracy], - feed_dict={ - images: train_images, - labels: train_labels, - is_training: True - }) - iters += 1 - print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec" - % (pass_id, iters, loss, acc, - 
len(data) / (time.time() - iter_begin_time))) - num_samples += len(data) - train_elapsed = time.time() - start_time - # test - pass_test_acc = test() - print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % - (pass_id, num_samples / train_elapsed, pass_test_acc)) - - -def print_arguments(): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - print_arguments() - - ps_hosts = args.ps_hosts.split(",") - worker_hosts = args.worker_hosts.split(",") - - # Create a cluster from the parameter server and worker hosts. - cluster_spec = tf.train.ClusterSpec({ - "ps": ps_hosts, - "worker": worker_hosts - }) - - # Create and start a server for the local task. - server = tf.train.Server( - cluster_spec, job_name=args.job_name, task_index=args.task_index) - - if args.job_name == "ps": - print("start pserver") - server.join() - elif args.job_name == "worker": - print("start worker") - run_benchmark(cluster_spec, server) diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py deleted file mode 100644 index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/vgg16_v2.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-#See the License for the specific language governing permissions and -#limitations under the License. - -import gzip - -import paddle.v2.dataset.cifar as cifar -import paddle.v2 as paddle -import time -import os - -DATA_DIM = 3 * 32 * 32 -CLASS_DIM = 10 -BATCH_SIZE = os.getenv("BATCH_SIZE") -if BATCH_SIZE: - BATCH_SIZE = int(BATCH_SIZE) -else: - BATCH_SIZE = 128 -print "batch_size", BATCH_SIZE -NODE_COUNT = int(os.getenv("TRAINERS")) -ts = 0 - - -def vgg(input, nums, class_dim): - def conv_block(input, num_filter, groups, num_channels=None): - return paddle.networks.img_conv_group( - input=input, - num_channels=num_channels, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=paddle.activation.Relu(), - pool_type=paddle.pooling.Max()) - - assert len(nums) == 5 - # the channel of input feature is 3 - conv1 = conv_block(input, 64, nums[0], 3) - conv2 = conv_block(conv1, 128, nums[1]) - conv3 = conv_block(conv2, 256, nums[2]) - conv4 = conv_block(conv3, 512, nums[3]) - conv5 = conv_block(conv4, 512, nums[4]) - - fc_dim = 512 - fc1 = paddle.layer.fc(input=conv5, - size=fc_dim, - act=paddle.activation.Relu(), - layer_attr=paddle.attr.Extra(drop_rate=0.5)) - fc2 = paddle.layer.fc(input=fc1, - size=fc_dim, - act=paddle.activation.Relu(), - layer_attr=paddle.attr.Extra(drop_rate=0.5)) - out = paddle.layer.fc(input=fc2, - size=class_dim, - act=paddle.activation.Softmax()) - return out - - -def vgg13(input, class_dim): - nums = [2, 2, 2, 2, 2] - return vgg(input, nums, class_dim) - - -def vgg16(input, class_dim): - nums = [2, 2, 3, 3, 3] - return vgg(input, nums, class_dim) - - -def vgg19(input, class_dim): - nums = [2, 2, 4, 4, 4] - return vgg(input, nums, class_dim) - - -def main(): - global ts - paddle.init(use_gpu=False) - image = paddle.layer.data( - name="image", type=paddle.data_type.dense_vector(DATA_DIM)) - lbl = paddle.layer.data( - name="label", type=paddle.data_type.integer_value(CLASS_DIM)) - - extra_layers 
= None - # NOTE: for v2 distributed training need averaging updates. - learning_rate = 1e-3 / NODE_COUNT - out = vgg16(image, class_dim=CLASS_DIM) - cost = paddle.layer.classification_cost(input=out, label=lbl) - - # Create parameters - parameters = paddle.parameters.create(cost) - - # Create optimizer - optimizer = paddle.optimizer.Momentum( - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0005 * - BATCH_SIZE), - learning_rate=learning_rate / BATCH_SIZE, - learning_rate_decay_a=0.1, - learning_rate_decay_b=128000 * 35, - learning_rate_schedule="discexp", ) - - train_reader = paddle.batch( - paddle.reader.shuffle( - cifar.train10(), - # To use other data, replace the above line with: - # reader.train_reader('train.list'), - buf_size=1000), - batch_size=BATCH_SIZE) - test_reader = paddle.batch( - cifar.test10(), - # To use other data, replace the above line with: - # reader.test_reader('val.list'), - batch_size=BATCH_SIZE) - - # Create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer, - extra_layers=extra_layers, - is_local=False) - - # End batch and end pass event handler - def event_handler(event): - global ts, ts_pass - if isinstance(event, paddle.event.BeginPass): - ts_pass = time.time() - if isinstance(event, paddle.event.BeginIteration): - ts = time.time() - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 1 == 0: - print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - time.time() - ts) - if isinstance(event, paddle.event.EndPass): - print "Pass %d end, spent: %f" % (event.pass_id, - time.time() - ts_pass) - result = trainer.test(reader=test_reader) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - trainer.train( - reader=train_reader, num_passes=200, event_handler=event_handler) - - -if __name__ == '__main__': - main() diff --git a/benchmark/fluid/fluid_benchmark.py 
b/benchmark/fluid/fluid_benchmark.py index 1d8f27440d0f1438e0520684ee3e90e8a5891a17..30b070e4acac60caa97a4e8ffd07462cb347ee93 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -94,6 +94,10 @@ def parse_args(): '--memory_optimize', action='store_true', help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') parser.add_argument( '--update_method', type=str, @@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe.run(train_prog) return + if args.use_fake_data: + raise Exception( + "fake data is not supported in single GPU test for now.") + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) @@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + # generate fake: + if args.use_fake_data: + for var in feed_var_list: + v = startup_prog.global_block().clone_variable(var) + var.persistable = True + v.persistable = True + + real_shape = list(var.shape) + real_shape[0] = args.batch_size / args.gpus + startup_prog.global_block().append_op( + outputs={"Out": v}, + type="fill_constant", + attrs={"shape": real_shape, + "value": 1.0, + "dtype": var.dtype}) + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if nccl_id_var and trainer_id == 0: + #FIXME(wuyi): wait other trainer to start listening + time.sleep(30) + startup_exe = fluid.Executor(place) startup_exe.run(startup_prog) strategy = fluid.ExecutionStrategy() @@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, 
optimizer, train_reader, test_reader, exec_strategy=strategy, num_trainers=num_trainers, trainer_id=trainer_id) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 @@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 if iters == args.iterations: break - loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) + if args.use_fake_data: + loss, = exe.run([avg_loss.name]) + else: + loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() num_samples += len(data) diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index 3dbb4b8c5dd13657f8d1853003b321ad047e1349..39ba207fd96f71563504017e77dc0e87c249b3f8 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -112,6 +112,7 @@ def gen_job(): envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) + envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) # NOTE: these directories below are cluster specific, please modify # this settings before you run on your own cluster. envs.append({ diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py index b64a7f78ff10d03987ea4a8c13a0e34bb433f64c..2d09d940a5ee638e4b55405d05924e2d76006cfc 100644 --- a/benchmark/fluid/kube_templates/__init__.py +++ b/benchmark/fluid/kube_templates/__init__.py @@ -54,5 +54,13 @@ envs = [ "fieldPath": "status.podIP" } } + }, + { + "name": "PADDLE_CURRENT_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } } ]