diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
- - Kubernetes: v1.6.2
- - Linux Kernel: v3.10.0
-
-- Resource
- - CPU: 10 Cores per Pod
- - Memory: 5GB per Pod
-
-- Docker Image
-
- We use different base Docker images to run the benchmark on Kubernetes:
- - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
- vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
- - Batch Size of training data.
- - PServer count of the training job.
- - The number of trainers.
-
-- Invariant
- - The resources of the trainer/pserver Pods.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-
-| PServer Count | 10 | 20 | 40 | 60 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \frac{T_1}{T_N}$
-
-where $S$ is the speedup: the ratio of $T_1$, the training time with 1 trainer, to $T_N$, the training time with $N$ trainers.
-The parallel efficiency is:
-
-$E = \frac{S}{N}$
-
-
-| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
-
-## Reproduce the benchmark
-
-TODO
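
As a brief illustration of the speedup and parallel-efficiency metrics defined above, a minimal sketch follows; the timing numbers in it are hypothetical placeholders, not measured results.

```python
# Minimal sketch (not part of the benchmark scripts): compute the speedup
# S = T1 / TN and parallel efficiency E = S / N defined above.
# The timings below are hypothetical placeholders, not measured results.

def parallel_efficiency(t_1, t_n, n_trainers):
    """Return (speedup, efficiency) for training times with 1 and N trainers."""
    speedup = t_1 / t_n
    efficiency = speedup / n_trainers
    return speedup, efficiency


if __name__ == "__main__":
    t_1 = 3600.0   # hypothetical: seconds per pass with 1 trainer
    t_40 = 120.0   # hypothetical: seconds per pass with 40 trainers
    s, e = parallel_efficiency(t_1, t_40, 40)
    print("S = %.2f, E = %.2f%%" % (s, e * 100))
```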
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# If the network is slow, you may need to set a proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# Unset the proxy if it was set above.
-# ENV https_proxy=""
-
-# NOTE: By default, wheel packages built by CI have WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# The lines below may change frequently during debugging.
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Information
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- CPU MHz: 2101.000
-- Cache size: 20480 KB
-
-### BLAS Settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
-### Acceleration Rate
-
-- PServer Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-
-| Trainer Count | 20 | 40 | 80 | 100 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
-
-### Different PServer Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples / sec
-
-
-| PServer Count | 3 | 6 | 10 | 20 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid (should be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 (need more tests) | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
-
-*The performance gap between Fluid and v2 comes from network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
-1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbose Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
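
The efficiency percentages in the acceleration-rate table above appear to be the distributed throughput divided by the trainer count times the single-node single-thread Fluid throughput at batch size 128 (16.74 samples/sec); the sketch below, under that assumption, reproduces them to within rounding.

```python
# Sketch: check the efficiency percentages in the "Acceleration Rate" table,
# assuming efficiency = throughput / (trainers * single-thread throughput).
SINGLE_THREAD_FLUID_BS128 = 16.74  # samples/sec, from the single-node table above

fluid_throughput = {20: 263.29, 40: 518.80, 80: 836.26, 100: 1019.29}  # samples/sec

for trainers, samples_per_sec in sorted(fluid_throughput.items()):
    efficiency = samples_per_sec / (trainers * SINGLE_THREAD_FLUID_BS128)
    print("%3d trainers: %8.2f samples/sec -> %.2f%% efficiency"
          % (trainers, samples_per_sec, efficiency * 100))
# Prints values close to the 78.64%, 77.47%, 62.44%, 60.89% reported above.
```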
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: MKL_NUM_THREADS
- value: "1"
- - name: TRAINING_ROLE
- value: "PSERVER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- command: ["paddle_k8s", "start_fluid"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_fluid"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: TRAINING_ROLE
- value: "TRAINER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
- ret=$1
- stdbuf -oL echo "job returned $ret...setting pod return message..."
- stdbuf -oL echo "==============================="
-
- if [ $ret -eq 136 ] ; then
- echo "Error: Arithmetic Operation (Floating Point Exception)" > /dev/termination-log
- elif [ $ret -eq 139 ] ; then
- echo "Segmentation Fault" > /dev/termination-log
- elif [ $ret -eq 1 ] ; then
- echo "General Error" > /dev/termination-log
- elif [ $ret -eq 134 ] ; then
- echo "Program Abort" > /dev/termination-log
- fi
- stdbuf -oL echo "termination log written..."
- exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
- pserver_label="tf-job-pserver=${JOB_NAME}"
- trainer_label="tf-job-trainer=${JOB_NAME}"
-
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
- g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
- g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
- wait_running_pods
-
- label="tf-job-pserver=${JOB_NAME}"
- pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
- wait_running_pods
-
- label="tf-job-trainer=${JOB_NAME}"
- trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
- check_trainer_ret $?
-}
-
-start_tf(){
- if [[ "${TF_JOB_NAME}" == "worker" ]]; then
- start_tf_trainer
- else
- start_tf_pserver
- fi
-}
-
-usage() {
- echo "usage: tf_k8s [command]:"
- echo " start_tf Start TensorFlow jobs"
-}
-
-case "$1" in
- start_tf)
- start_tf
- ;;
- --help)
- usage
- ;;
- *)
- usage
- ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-tf-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- tf-job-pserver: vgg16job-tf
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: PSERVERS_NUM
- value: "10"
- - name: TF_JOB_NAME
- value: "ps"
- - name: TRAINERS_NUM
- value: "20"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-tf-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- tf-job-trainer: vgg16job-tf
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: TF_JOB_NAME
- value: "worker"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: PSERVERS_NUM
- value: "10"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINERS_NUM
- value: "20"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16v2job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16v2job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "python train.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- command: ["paddle_k8s", "start_pserver"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16v2job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16v2job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_trainer", "v2"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: BATCH_SIZE
- value: "256"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "2"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=16, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data order, now only support NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='flowers',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--local',
- type=str2bool,
- default=True,
- help='Whether to run in local mode.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--trainer_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(batch_acc)
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
- args.device_id)
- exe = fluid.Executor(place)
-
- # test
- def test(exe):
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- outs = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size])
- test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
- return test_pass_acc.eval()
-
- def train_loop(exe, trainer_prog):
- iters = 0
- ts = time.time()
- train_pass_acc = fluid.average.WeightedAverage()
- for pass_id in range(args.num_passes):
- # train
- start_time = time.time()
- num_samples = 0
- train_pass_acc.reset()
-
- def run_step(batch_id, data):
- img_data = np.array(
- map(lambda x: x[0].reshape(data_shape), data)).astype(
- "float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, b_size = exe.run(
- trainer_prog,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size])
- return loss, acc, b_size
-
- if args.profile:
- with profiler.profiler('All', 'total',
- '/tmp/profile_vgg_%d' % args.task_index):
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
-
- total_time = 0.0
- count = 0
- for batch_id, data in enumerate(train_reader()):
- ts = time.time()
- loss, acc, b_size = run_step(batch_id, data)
- iters += 1
- num_samples += len(data)
- train_pass_acc.add(value=acc, weight=b_size)
-
- duration = time.time() - ts
- total_time += duration
- count += len(data)
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
- "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
- len(data) / duration,
- count / total_time)
- ) # The accuracy is accumulated over batches, not just the current batch.
-
- pass_elapsed = time.time() - start_time
- pass_train_acc = train_pass_acc.eval()
- pass_test_acc = test(exe)
- print("Task:%d Pass = %d, Training performance = %f imgs/s, "
- "Train accuracy = %f, Test accuracy = %f\n" %
- (args.task_index, pass_id, num_samples / pass_elapsed,
- pass_train_acc, pass_test_acc))
-
- if args.local:
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
- train_loop(exe, fluid.default_main_program())
- else:
- trainers = int(os.getenv("TRAINERS")) # total trainer count
- print("trainers total: ", trainers)
-
- training_role = os.getenv(
- "TRAINING_ROLE",
- "TRAINER") # get the training role: trainer/pserver
-
- t = fluid.DistributeTranspiler()
- t.transpile(
- trainer_id=args.task_index,
- pservers=args.ps_hosts,
- trainers=trainers)
-
- if training_role == "PSERVER":
- current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
- "PADDLE_INIT_PORT")
- if not current_endpoint:
- print("need env SERVER_ENDPOINT")
- exit(1)
- pserver_prog = t.get_pserver_program(current_endpoint)
- pserver_startup = t.get_startup_program(current_endpoint,
- pserver_prog)
- exe.run(pserver_startup)
- exe.run(pserver_prog)
- elif training_role == "TRAINER":
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
- paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- trainer_prog = t.get_trainer_program()
- feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
- # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
- exe.run(fluid.default_startup_program())
- train_loop(exe, trainer_prog)
- else:
- print("environment variable TRAINING_ROLE should be TRAINER or PSERVER")
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NHWC',
- choices=['NCHW', 'NHWC'],
- help='The data order, NCHW=[batch, channels, height, width]. '
- 'Only NHWC is supported right now.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--worker_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
- def __init__(self):
- self.parameters = []
-
- def batch_norm_relu(self, inputs, is_training):
- """Performs a batch normalization followed by a ReLU."""
- # We set fused=True for a significant speed boost. See
- # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
- inputs = tf.layers.batch_normalization(
- inputs=inputs,
- axis=1 if args.data_format == 'NCHW' else -1,
- momentum=0.9,
- epsilon=1e-05,
- center=True,
- scale=True,
- training=is_training,
- fused=True)
- inputs = tf.nn.relu(inputs)
- return inputs
-
- def conv_bn_layer(self,
- name,
- images,
- kernel_shape,
- is_training,
- drop_rate=0.0):
- with tf.name_scope(name) as scope:
- kernel = tf.Variable(
- tf.truncated_normal(
- kernel_shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- conv = tf.nn.conv2d(
- images,
- kernel, [1, 1, 1, 1],
- data_format=args.data_format,
- padding='SAME')
- biases = tf.Variable(
- tf.constant(
- 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(conv, biases)
- out = self.batch_norm_relu(out, is_training)
- out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
- return out
-
- def fc_layer(self, name, inputs, shape):
- with tf.name_scope(name) as scope:
- fc_w = tf.Variable(
- tf.truncated_normal(
- shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- fc_b = tf.Variable(
- tf.constant(
- 0.0, shape=[shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
- return out
-
- def network(self, images, class_dim, is_training):
- """ VGG16 model structure.
-
- TODO(kuke): enable this network to support the 'NCHW' data format
- """
-
- # conv1
- conv1_1 = self.conv_bn_layer(
- 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
- conv1_2 = self.conv_bn_layer(
- 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
- # pool1
- pool1 = tf.nn.max_pool(
- conv1_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool1')
- # conv2
- conv2_1 = self.conv_bn_layer(
- 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
- conv2_2 = self.conv_bn_layer(
- 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
- # pool2
- pool2 = tf.nn.max_pool(
- conv2_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool2')
- # conv3
- conv3_1 = self.conv_bn_layer(
- 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
- conv3_2 = self.conv_bn_layer(
- 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
- conv3_3 = self.conv_bn_layer(
- 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
- # pool3
- pool3 = tf.nn.max_pool(
- conv3_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool3')
- # conv4
- conv4_1 = self.conv_bn_layer(
- 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
- conv4_2 = self.conv_bn_layer(
- 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv4_3 = self.conv_bn_layer(
- 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool4
- pool4 = tf.nn.max_pool(
- conv4_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # conv5
- conv5_1 = self.conv_bn_layer(
- 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_2 = self.conv_bn_layer(
- 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_3 = self.conv_bn_layer(
- 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool5
- pool5 = tf.nn.max_pool(
- conv5_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool5')
- # flatten
- shape = int(np.prod(pool5.get_shape()[1:]))
- pool5_flat = tf.reshape(pool5, [-1, shape])
- # fc1
- drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
- fc1 = self.fc_layer('fc1', drop, [shape, 512])
- # fc2
- bn = self.batch_norm_relu(fc1, is_training)
- drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
- fc2 = self.fc_layer('fc2', drop, [512, 512])
-
- fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
- return fc3
-
-
-def run_benchmark(cluster_spec, server):
- """Run benchmark on cifar10 or flowers."""
-
- if args.data_set == "cifar10":
- class_dim = 10
- raw_shape = (3, 32, 32)
- dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
- None, 3, 32, 32)
- else:
- class_dim = 102
- raw_shape = (3, 224, 224)
- dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
- None, 3, 224, 224)
-
- device = tf.train.replica_device_setter(
- worker_device="/job:worker/task:{}".format(args.task_index),
- cluster=cluster_spec)
-
- with tf.device(device):
- images = tf.placeholder(tf.float32, shape=dat_shape)
- labels = tf.placeholder(tf.int64, shape=(None, ))
- is_training = tf.placeholder('bool')
- onehot_labels = tf.one_hot(labels, depth=class_dim)
-
- vgg16 = VGG16Model()
- logits = vgg16.network(images, class_dim, is_training)
- loss = tf.losses.softmax_cross_entropy(
- onehot_labels=onehot_labels, logits=logits)
- avg_loss = tf.reduce_mean(loss)
-
- correct = tf.equal(tf.argmax(logits, 1), labels)
- accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
- optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- global_step = tf.Variable(0, name='global_step', trainable=False)
- with tf.control_dependencies(update_ops):
- train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
- summary_op = tf.summary.merge_all()
- init_op = tf.global_variables_initializer()
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- buf_size=5120),
- batch_size=args.batch_size)
-
- # test
- def test():
- test_accs = []
- for batch_id, data in enumerate(test_reader()):
- test_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
- test_accs.append(
- accuracy.eval(feed_dict={
- images: test_images,
- labels: test_labels,
- is_training: False
- }))
- return np.mean(test_accs)
-
- config = tf.ConfigProto(
- intra_op_parallelism_threads=1,
- inter_op_parallelism_threads=1,
- log_device_placement=True)
- config.gpu_options.allow_growth = True
-
- hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
- with tf.train.MonitoredTrainingSession(
- master=server.target,
- is_chief=(args.task_index == 0),
- hooks=hooks,
- config=config) as sess:
- iters, num_samples, start_time = 0, 0, 0.0
- for pass_id in range(args.num_passes):
- # train
- num_samples = 0
- start_time = time.time()
- for batch_id, data in enumerate(train_reader()):
- train_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- train_labels = np.array(map(lambda x: x[1], data)).astype(
- 'int64')
- iter_begin_time = time.time()
- _, loss, acc = sess.run([train_op, avg_loss, accuracy],
- feed_dict={
- images: train_images,
- labels: train_labels,
- is_training: True
- })
- iters += 1
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
- % (pass_id, iters, loss, acc,
- len(data) / (time.time() - iter_begin_time)))
- num_samples += len(data)
- train_elapsed = time.time() - start_time
- # test
- pass_test_acc = test()
- print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
- (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- print_arguments()
-
- ps_hosts = args.ps_hosts.split(",")
- worker_hosts = args.worker_hosts.split(",")
-
- # Create a cluster from the parameter server and worker hosts.
- cluster_spec = tf.train.ClusterSpec({
- "ps": ps_hosts,
- "worker": worker_hosts
- })
-
- # Create and start a server for the local task.
- server = tf.train.Server(
- cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
- if args.job_name == "ps":
- print("start pserver")
- server.join()
- elif args.job_name == "worker":
- print("start worker")
- run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
- BATCH_SIZE = int(BATCH_SIZE)
-else:
- BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
- def conv_block(input, num_filter, groups, num_channels=None):
- return paddle.networks.img_conv_group(
- input=input,
- num_channels=num_channels,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=paddle.activation.Relu(),
- pool_type=paddle.pooling.Max())
-
- assert len(nums) == 5
- # the channel of input feature is 3
- conv1 = conv_block(input, 64, nums[0], 3)
- conv2 = conv_block(conv1, 128, nums[1])
- conv3 = conv_block(conv2, 256, nums[2])
- conv4 = conv_block(conv3, 512, nums[3])
- conv5 = conv_block(conv4, 512, nums[4])
-
- fc_dim = 512
- fc1 = paddle.layer.fc(input=conv5,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- fc2 = paddle.layer.fc(input=fc1,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- out = paddle.layer.fc(input=fc2,
- size=class_dim,
- act=paddle.activation.Softmax())
- return out
-
-
-def vgg13(input, class_dim):
- nums = [2, 2, 2, 2, 2]
- return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
- nums = [2, 2, 3, 3, 3]
- return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
- nums = [2, 2, 4, 4, 4]
- return vgg(input, nums, class_dim)
-
-
-def main():
- global ts
- paddle.init(use_gpu=False)
- image = paddle.layer.data(
- name="image", type=paddle.data_type.dense_vector(DATA_DIM))
- lbl = paddle.layer.data(
- name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
- extra_layers = None
- # NOTE: v2 distributed training needs to average updates, so scale the learning rate by the node count.
- learning_rate = 1e-3 / NODE_COUNT
- out = vgg16(image, class_dim=CLASS_DIM)
- cost = paddle.layer.classification_cost(input=out, label=lbl)
-
- # Create parameters
- parameters = paddle.parameters.create(cost)
-
- # Create optimizer
- optimizer = paddle.optimizer.Momentum(
- momentum=0.9,
- regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
- BATCH_SIZE),
- learning_rate=learning_rate / BATCH_SIZE,
- learning_rate_decay_a=0.1,
- learning_rate_decay_b=128000 * 35,
- learning_rate_schedule="discexp", )
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- cifar.train10(),
- # To use other data, replace the above line with:
- # reader.train_reader('train.list'),
- buf_size=1000),
- batch_size=BATCH_SIZE)
- test_reader = paddle.batch(
- cifar.test10(),
- # To use other data, replace the above line with:
- # reader.test_reader('val.list'),
- batch_size=BATCH_SIZE)
-
- # Create trainer
- trainer = paddle.trainer.SGD(cost=cost,
- parameters=parameters,
- update_equation=optimizer,
- extra_layers=extra_layers,
- is_local=False)
-
- # End batch and end pass event handler
- def event_handler(event):
- global ts, ts_pass
- if isinstance(event, paddle.event.BeginPass):
- ts_pass = time.time()
- if isinstance(event, paddle.event.BeginIteration):
- ts = time.time()
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 1 == 0:
- print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- time.time() - ts)
- if isinstance(event, paddle.event.EndPass):
- print "Pass %d end, spent: %f" % (event.pass_id,
- time.time() - ts_pass)
- result = trainer.test(reader=test_reader)
- print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
- trainer.train(
- reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 1d8f27440d0f1438e0520684ee3e90e8a5891a17..30b070e4acac60caa97a4e8ffd07462cb347ee93 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+ help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--update_method',
type=str,
@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
+ if args.use_fake_data:
+ raise Exception(
+ "fake data is not supported in single GPU test for now.")
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+ # generate fake data in the startup program:
+ if args.use_fake_data:
+ for var in feed_var_list:
+ v = startup_prog.global_block().clone_variable(var)
+ var.persistable = True
+ v.persistable = True
+
+ real_shape = list(var.shape)
+ real_shape[0] = args.batch_size / args.gpus
+ startup_prog.global_block().append_op(
+ outputs={"Out": v},
+ type="fill_constant",
+ attrs={"shape": real_shape,
+ "value": 1.0,
+ "dtype": var.dtype})
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ if nccl_id_var and trainer_id == 0:
+ # FIXME(wuyi): wait for the other trainers to start listening
+ time.sleep(30)
+
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
- feed_var_list = [
- var for var in train_prog.global_block().vars.itervalues()
- if var.is_data
- ]
+
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
@@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
- loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+ if args.use_fake_data:
+ loss, = exe.run([avg_loss.name])
+ else:
+ loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
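
For readers unfamiliar with the `--use_fake_data` path added above: the idea is to materialize constant input tensors so the training loop can call `exe.run` without a feed. Below is a simplified, self-contained sketch of that idea, assuming an old `paddle.fluid` install; the toy network and shapes are illustrative, not the benchmark's code, and the benchmark itself goes further by filling persistable variables once in the startup program.

```python
# Simplified sketch of the fake-data idea, assuming an old paddle.fluid install.
# Shapes, sizes, and the toy network are illustrative, not the benchmark's code.
import paddle.fluid as fluid

BATCH_SIZE = 32  # hypothetical per-device batch size

# Constant tensors stand in for real input data, so no feed is required.
image = fluid.layers.fill_constant(
    shape=[BATCH_SIZE, 784], dtype='float32', value=1.0)
label = fluid.layers.fill_constant(
    shape=[BATCH_SIZE, 1], dtype='int64', value=0)

prediction = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
loss_val, = exe.run(fluid.default_main_program(),
                    fetch_list=[loss.name])  # note: no feed= argument
print("loss on fake data: %f" % loss_val[0])
```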
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index 3dbb4b8c5dd13657f8d1853003b321ad047e1349..39ba207fd96f71563504017e77dc0e87c249b3f8 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+ envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# these settings before you run on your own cluster.
envs.append({
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
index b64a7f78ff10d03987ea4a8c13a0e34bb433f64c..2d09d940a5ee638e4b55405d05924e2d76006cfc 100644
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
+ },
+ {
+ "name": "PADDLE_CURRENT_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
}
]