diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
- - Kubernetes: v1.6.2
- - Linux Kernel: v3.10.0
-
-- Resource
- - CPU: 10 Cores per Pod
- - Memory: 5GB per Pod
-
-- Docker Image
-
- We use different base Docker images to run the benchmark on Kubernetes:
- - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
- vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
- - Batch Size of training data.
- - PServer count of the training job.
- - The number of trainers.
-
-- Invariant
- - The resources of the trainer/pserver Pods.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-
-| PServer Count | 10 | 20 | 40 | 60 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \frac{T_1}{T_N}$
-
-where $S$ is the speedup: the ratio of $T_1$, the training time with 1 trainer, to $T_N$, the training time with $N$ trainers.
-The parallel efficiency is:
-
-$E = \frac{S}{N}$
-
-
-| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
-
-## Reproduce the benchmark
-
-TODO
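
As a brief illustration of the speedup and parallel-efficiency metrics defined above, a minimal sketch follows; the timing numbers in it are hypothetical placeholders, not measured results.

```python
# Minimal sketch (not part of the benchmark scripts): compute the speedup
# S = T1 / TN and parallel efficiency E = S / N defined above.
# The timings below are hypothetical placeholders, not measured results.

def parallel_efficiency(t_1, t_n, n_trainers):
    """Return (speedup, efficiency) for training times with 1 and N trainers."""
    speedup = t_1 / t_n
    efficiency = speedup / n_trainers
    return speedup, efficiency


if __name__ == "__main__":
    t_1 = 3600.0   # hypothetical: seconds per pass with 1 trainer
    t_40 = 120.0   # hypothetical: seconds per pass with 40 trainers
    s, e = parallel_efficiency(t_1, t_40, 40)
    print("S = %.2f, E = %.2f%%" % (s, e * 100))
```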
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# If the network is slow, you may need to set a proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# Unset the proxy if it was set above.
-# ENV https_proxy=""
-
-# NOTE: By default, wheel packages built by CI have WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# The lines below may change frequently during debugging.
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Information
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- CPU MHz: 2101.000
-- Cache size: 20480 KB
-
-### BLAS Settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
-### Acceleration Rate
-
-- PServer Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-
-| Trainer Count | 20 | 40 | 80 | 100 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
-
-### Different PServer Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples / sec
-
-
-| PServer Count | 3 | 6 | 10 | 20 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid (should be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 (need more tests) | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
-
-*The performance gap between Fluid and v2 comes from network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
-1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbose Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
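
The efficiency percentages in the acceleration-rate table above appear to be the distributed throughput divided by the trainer count times the single-node single-thread Fluid throughput at batch size 128 (16.74 samples/sec); the sketch below, under that assumption, reproduces them to within rounding.

```python
# Sketch: check the efficiency percentages in the "Acceleration Rate" table,
# assuming efficiency = throughput / (trainers * single-thread throughput).
SINGLE_THREAD_FLUID_BS128 = 16.74  # samples/sec, from the single-node table above

fluid_throughput = {20: 263.29, 40: 518.80, 80: 836.26, 100: 1019.29}  # samples/sec

for trainers, samples_per_sec in sorted(fluid_throughput.items()):
    efficiency = samples_per_sec / (trainers * SINGLE_THREAD_FLUID_BS128)
    print("%3d trainers: %8.2f samples/sec -> %.2f%% efficiency"
          % (trainers, samples_per_sec, efficiency * 100))
# Prints values close to the 78.64%, 77.47%, 62.44%, 60.89% reported above.
```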
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: MKL_NUM_THREADS
- value: "1"
- - name: TRAINING_ROLE
- value: "PSERVER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- command: ["paddle_k8s", "start_fluid"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_fluid"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: TRAINING_ROLE
- value: "TRAINER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
- ret=$1
- stdbuf -oL echo "job returned $ret...setting pod return message..."
- stdbuf -oL echo "==============================="
-
- if [ $ret -eq 136 ] ; then
- echo "Error: Arithmetic Operation (Floating Point Exception)" > /dev/termination-log
- elif [ $ret -eq 139 ] ; then
- echo "Segmentation Fault" > /dev/termination-log
- elif [ $ret -eq 1 ] ; then
- echo "General Error" > /dev/termination-log
- elif [ $ret -eq 134 ] ; then
- echo "Program Abort" > /dev/termination-log
- fi
- stdbuf -oL echo "termination log written..."
- exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
- pserver_label="tf-job-pserver=${JOB_NAME}"
- trainer_label="tf-job-trainer=${JOB_NAME}"
-
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
- g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
- g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
- wait_running_pods
-
- label="tf-job-pserver=${JOB_NAME}"
- pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
- wait_running_pods
-
- label="tf-job-trainer=${JOB_NAME}"
- trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
- check_trainer_ret $?
-}
-
-start_tf(){
- if [[ "${TF_JOB_NAME}" == "worker" ]]; then
- start_tf_trainer
- else
- start_tf_pserver
- fi
-}
-
-usage() {
- echo "usage: tf_k8s [command]:"
- echo " start_tf Start TensorFlow jobs"
-}
-
-case "$1" in
- start_tf)
- start_tf
- ;;
- --help)
- usage
- ;;
- *)
- usage
- ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-tf-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- tf-job-pserver: vgg16job-tf
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: PSERVERS_NUM
- value: "10"
- - name: TF_JOB_NAME
- value: "ps"
- - name: TRAINERS_NUM
- value: "20"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-tf-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- tf-job-trainer: vgg16job-tf
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: TF_JOB_NAME
- value: "worker"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: PSERVERS_NUM
- value: "10"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINERS_NUM
- value: "20"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16v2job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16v2job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "python train.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- command: ["paddle_k8s", "start_pserver"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16v2job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16v2job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_trainer", "v2"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: BATCH_SIZE
- value: "256"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "2"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=16, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data order, now only support NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='flowers',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--local',
- type=str2bool,
- default=True,
- help='Whether to run in local mode.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--trainer_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(batch_acc)
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
- args.device_id)
- exe = fluid.Executor(place)
-
- # test
- def test(exe):
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- outs = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size])
- test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
- return test_pass_acc.eval()
-
- def train_loop(exe, trainer_prog):
- iters = 0
- ts = time.time()
- train_pass_acc = fluid.average.WeightedAverage()
- for pass_id in range(args.num_passes):
- # train
- start_time = time.time()
- num_samples = 0
- train_pass_acc.reset()
-
- def run_step(batch_id, data):
- img_data = np.array(
- map(lambda x: x[0].reshape(data_shape), data)).astype(
- "float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, b_size = exe.run(
- trainer_prog,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size])
- return loss, acc, b_size
-
- if args.profile:
- with profiler.profiler('All', 'total',
- '/tmp/profile_vgg_%d' % args.task_index):
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
-
- total_time = 0.0
- count = 0
- for batch_id, data in enumerate(train_reader()):
- ts = time.time()
- loss, acc, b_size = run_step(batch_id, data)
- iters += 1
- num_samples += len(data)
- train_pass_acc.add(value=acc, weight=b_size)
-
- duration = time.time() - ts
- total_time += duration
- count += len(data)
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
- "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
- len(data) / duration,
- count / total_time)
- ) # The accuracy is accumulated over batches, not just the current batch.
-
- pass_elapsed = time.time() - start_time
- pass_train_acc = train_pass_acc.eval()
- pass_test_acc = test(exe)
- print("Task:%d Pass = %d, Training performance = %f imgs/s, "
- "Train accuracy = %f, Test accuracy = %f\n" %
- (args.task_index, pass_id, num_samples / pass_elapsed,
- pass_train_acc, pass_test_acc))
-
- if args.local:
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
- train_loop(exe, fluid.default_main_program())
- else:
- trainers = int(os.getenv("TRAINERS")) # total trainer count
- print("trainers total: ", trainers)
-
- training_role = os.getenv(
- "TRAINING_ROLE",
- "TRAINER") # get the training role: trainer/pserver
-
- t = fluid.DistributeTranspiler()
- t.transpile(
- trainer_id=args.task_index,
- pservers=args.ps_hosts,
- trainers=trainers)
-
- if training_role == "PSERVER":
- current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
- "PADDLE_INIT_PORT")
- if not current_endpoint:
- print("need env SERVER_ENDPOINT")
- exit(1)
- pserver_prog = t.get_pserver_program(current_endpoint)
- pserver_startup = t.get_startup_program(current_endpoint,
- pserver_prog)
- exe.run(pserver_startup)
- exe.run(pserver_prog)
- elif training_role == "TRAINER":
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
- paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- trainer_prog = t.get_trainer_program()
- feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
- # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
- exe.run(fluid.default_startup_program())
- train_loop(exe, trainer_prog)
- else:
- print("environment variable TRAINING_ROLE should be TRAINER or PSERVER")
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NHWC',
- choices=['NCHW', 'NHWC'],
- help='The data order, NCHW=[batch, channels, height, width]. '
- 'Only NHWC is supported right now.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--worker_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
- def __init__(self):
- self.parameters = []
-
- def batch_norm_relu(self, inputs, is_training):
- """Performs a batch normalization followed by a ReLU."""
- # We set fused=True for a significant speed boost. See
- # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
- inputs = tf.layers.batch_normalization(
- inputs=inputs,
- axis=1 if args.data_format == 'NCHW' else -1,
- momentum=0.9,
- epsilon=1e-05,
- center=True,
- scale=True,
- training=is_training,
- fused=True)
- inputs = tf.nn.relu(inputs)
- return inputs
-
- def conv_bn_layer(self,
- name,
- images,
- kernel_shape,
- is_training,
- drop_rate=0.0):
- with tf.name_scope(name) as scope:
- kernel = tf.Variable(
- tf.truncated_normal(
- kernel_shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- conv = tf.nn.conv2d(
- images,
- kernel, [1, 1, 1, 1],
- data_format=args.data_format,
- padding='SAME')
- biases = tf.Variable(
- tf.constant(
- 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(conv, biases)
- out = self.batch_norm_relu(out, is_training)
- out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
- return out
-
- def fc_layer(self, name, inputs, shape):
- with tf.name_scope(name) as scope:
- fc_w = tf.Variable(
- tf.truncated_normal(
- shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- fc_b = tf.Variable(
- tf.constant(
- 0.0, shape=[shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
- return out
-
- def network(self, images, class_dim, is_training):
- """ VGG16 model structure.
-
- TODO(kuke): enable this network to support the 'NCHW' data format
- """
-
- # conv1
- conv1_1 = self.conv_bn_layer(
- 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
- conv1_2 = self.conv_bn_layer(
- 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
- # pool1
- pool1 = tf.nn.max_pool(
- conv1_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool1')
- # conv2
- conv2_1 = self.conv_bn_layer(
- 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
- conv2_2 = self.conv_bn_layer(
- 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
- # pool2
- pool2 = tf.nn.max_pool(
- conv2_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool2')
- # conv3
- conv3_1 = self.conv_bn_layer(
- 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
- conv3_2 = self.conv_bn_layer(
- 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
- conv3_3 = self.conv_bn_layer(
- 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
- # pool3
- pool3 = tf.nn.max_pool(
- conv3_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool3')
- # conv4
- conv4_1 = self.conv_bn_layer(
- 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
- conv4_2 = self.conv_bn_layer(
- 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv4_3 = self.conv_bn_layer(
- 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool4
- pool4 = tf.nn.max_pool(
- conv4_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # conv5
- conv5_1 = self.conv_bn_layer(
- 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_2 = self.conv_bn_layer(
- 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_3 = self.conv_bn_layer(
- 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool5
- pool5 = tf.nn.max_pool(
- conv5_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool5')
- # flatten
- shape = int(np.prod(pool5.get_shape()[1:]))
- pool5_flat = tf.reshape(pool5, [-1, shape])
- # fc1
- drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
- fc1 = self.fc_layer('fc1', drop, [shape, 512])
- # fc2
- bn = self.batch_norm_relu(fc1, is_training)
- drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
- fc2 = self.fc_layer('fc2', drop, [512, 512])
-
- fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
- return fc3
-
-
-def run_benchmark(cluster_spec, server):
- """Run benchmark on cifar10 or flowers."""
-
- if args.data_set == "cifar10":
- class_dim = 10
- raw_shape = (3, 32, 32)
- dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
- None, 3, 32, 32)
- else:
- class_dim = 102
- raw_shape = (3, 224, 224)
- dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
- None, 3, 224, 224)
-
- device = tf.train.replica_device_setter(
- worker_device="/job:worker/task:{}".format(args.task_index),
- cluster=cluster_spec)
-
- with tf.device(device):
- images = tf.placeholder(tf.float32, shape=dat_shape)
- labels = tf.placeholder(tf.int64, shape=(None, ))
- is_training = tf.placeholder('bool')
- onehot_labels = tf.one_hot(labels, depth=class_dim)
-
- vgg16 = VGG16Model()
- logits = vgg16.network(images, class_dim, is_training)
- loss = tf.losses.softmax_cross_entropy(
- onehot_labels=onehot_labels, logits=logits)
- avg_loss = tf.reduce_mean(loss)
-
- correct = tf.equal(tf.argmax(logits, 1), labels)
- accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
- optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- global_step = tf.Variable(0, name='global_step', trainable=False)
- with tf.control_dependencies(update_ops):
- train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
- summary_op = tf.summary.merge_all()
- init_op = tf.global_variables_initializer()
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- buf_size=5120),
- batch_size=args.batch_size)
-
- # test
- def test():
- test_accs = []
- for batch_id, data in enumerate(test_reader()):
- test_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
- test_accs.append(
- accuracy.eval(feed_dict={
- images: test_images,
- labels: test_labels,
- is_training: False
- }))
- return np.mean(test_accs)
-
- config = tf.ConfigProto(
- intra_op_parallelism_threads=1,
- inter_op_parallelism_threads=1,
- log_device_placement=True)
- config.gpu_options.allow_growth = True
-
- hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
- with tf.train.MonitoredTrainingSession(
- master=server.target,
- is_chief=(args.task_index == 0),
- hooks=hooks,
- config=config) as sess:
- iters, num_samples, start_time = 0, 0, 0.0
- for pass_id in range(args.num_passes):
- # train
- num_samples = 0
- start_time = time.time()
- for batch_id, data in enumerate(train_reader()):
- train_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- train_labels = np.array(map(lambda x: x[1], data)).astype(
- 'int64')
- iter_begin_time = time.time()
- _, loss, acc = sess.run([train_op, avg_loss, accuracy],
- feed_dict={
- images: train_images,
- labels: train_labels,
- is_training: True
- })
- iters += 1
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
- % (pass_id, iters, loss, acc,
- len(data) / (time.time() - iter_begin_time)))
- num_samples += len(data)
- train_elapsed = time.time() - start_time
- # test
- pass_test_acc = test()
- print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
- (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- print_arguments()
-
- ps_hosts = args.ps_hosts.split(",")
- worker_hosts = args.worker_hosts.split(",")
-
- # Create a cluster from the parameter server and worker hosts.
- cluster_spec = tf.train.ClusterSpec({
- "ps": ps_hosts,
- "worker": worker_hosts
- })
-
- # Create and start a server for the local task.
- server = tf.train.Server(
- cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
- if args.job_name == "ps":
- print("start pserver")
- server.join()
- elif args.job_name == "worker":
- print("start worker")
- run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
- BATCH_SIZE = int(BATCH_SIZE)
-else:
- BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
- def conv_block(input, num_filter, groups, num_channels=None):
- return paddle.networks.img_conv_group(
- input=input,
- num_channels=num_channels,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=paddle.activation.Relu(),
- pool_type=paddle.pooling.Max())
-
- assert len(nums) == 5
- # the channel of input feature is 3
- conv1 = conv_block(input, 64, nums[0], 3)
- conv2 = conv_block(conv1, 128, nums[1])
- conv3 = conv_block(conv2, 256, nums[2])
- conv4 = conv_block(conv3, 512, nums[3])
- conv5 = conv_block(conv4, 512, nums[4])
-
- fc_dim = 512
- fc1 = paddle.layer.fc(input=conv5,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- fc2 = paddle.layer.fc(input=fc1,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- out = paddle.layer.fc(input=fc2,
- size=class_dim,
- act=paddle.activation.Softmax())
- return out
-
-
-def vgg13(input, class_dim):
- nums = [2, 2, 2, 2, 2]
- return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
- nums = [2, 2, 3, 3, 3]
- return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
- nums = [2, 2, 4, 4, 4]
- return vgg(input, nums, class_dim)
-
-
-def main():
- global ts
- paddle.init(use_gpu=False)
- image = paddle.layer.data(
- name="image", type=paddle.data_type.dense_vector(DATA_DIM))
- lbl = paddle.layer.data(
- name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
- extra_layers = None
- # NOTE: v2 distributed training needs to average updates, so scale the learning rate by the node count.
- learning_rate = 1e-3 / NODE_COUNT
- out = vgg16(image, class_dim=CLASS_DIM)
- cost = paddle.layer.classification_cost(input=out, label=lbl)
-
- # Create parameters
- parameters = paddle.parameters.create(cost)
-
- # Create optimizer
- optimizer = paddle.optimizer.Momentum(
- momentum=0.9,
- regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
- BATCH_SIZE),
- learning_rate=learning_rate / BATCH_SIZE,
- learning_rate_decay_a=0.1,
- learning_rate_decay_b=128000 * 35,
- learning_rate_schedule="discexp", )
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- cifar.train10(),
- # To use other data, replace the above line with:
- # reader.train_reader('train.list'),
- buf_size=1000),
- batch_size=BATCH_SIZE)
- test_reader = paddle.batch(
- cifar.test10(),
- # To use other data, replace the above line with:
- # reader.test_reader('val.list'),
- batch_size=BATCH_SIZE)
-
- # Create trainer
- trainer = paddle.trainer.SGD(cost=cost,
- parameters=parameters,
- update_equation=optimizer,
- extra_layers=extra_layers,
- is_local=False)
-
- # End batch and end pass event handler
- def event_handler(event):
- global ts, ts_pass
- if isinstance(event, paddle.event.BeginPass):
- ts_pass = time.time()
- if isinstance(event, paddle.event.BeginIteration):
- ts = time.time()
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 1 == 0:
- print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- time.time() - ts)
- if isinstance(event, paddle.event.EndPass):
- print "Pass %d end, spent: %f" % (event.pass_id,
- time.time() - ts_pass)
- result = trainer.test(reader=test_reader)
- print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
- trainer.train(
- reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 1d8f27440d0f1438e0520684ee3e90e8a5891a17..30b070e4acac60caa97a4e8ffd07462cb347ee93 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+ help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--update_method',
type=str,
@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
+ if args.use_fake_data:
+ raise Exception(
+ "fake data is not supported in single GPU test for now.")
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+ # generate fake data in the startup program:
+ if args.use_fake_data:
+ for var in feed_var_list:
+ v = startup_prog.global_block().clone_variable(var)
+ var.persistable = True
+ v.persistable = True
+
+ real_shape = list(var.shape)
+ real_shape[0] = args.batch_size / args.gpus
+ startup_prog.global_block().append_op(
+ outputs={"Out": v},
+ type="fill_constant",
+ attrs={"shape": real_shape,
+ "value": 1.0,
+ "dtype": var.dtype})
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ if nccl_id_var and trainer_id == 0:
+ # FIXME(wuyi): wait for the other trainers to start listening
+ time.sleep(30)
+
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
- feed_var_list = [
- var for var in train_prog.global_block().vars.itervalues()
- if var.is_data
- ]
+
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
@@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
- loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+ if args.use_fake_data:
+ loss, = exe.run([avg_loss.name])
+ else:
+ loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
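
For readers unfamiliar with the `--use_fake_data` path added above: the idea is to materialize constant input tensors so the training loop can call `exe.run` without a feed. Below is a simplified, self-contained sketch of that idea, assuming an old `paddle.fluid` install; the toy network and shapes are illustrative, not the benchmark's code, and the benchmark itself goes further by filling persistable variables once in the startup program.

```python
# Simplified sketch of the fake-data idea, assuming an old paddle.fluid install.
# Shapes, sizes, and the toy network are illustrative, not the benchmark's code.
import paddle.fluid as fluid

BATCH_SIZE = 32  # hypothetical per-device batch size

# Constant tensors stand in for real input data, so no feed is required.
image = fluid.layers.fill_constant(
    shape=[BATCH_SIZE, 784], dtype='float32', value=1.0)
label = fluid.layers.fill_constant(
    shape=[BATCH_SIZE, 1], dtype='int64', value=0)

prediction = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
loss_val, = exe.run(fluid.default_main_program(),
                    fetch_list=[loss.name])  # note: no feed= argument
print("loss on fake data: %f" % loss_val[0])
```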
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index 3dbb4b8c5dd13657f8d1853003b321ad047e1349..39ba207fd96f71563504017e77dc0e87c249b3f8 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+ envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# these settings before you run on your own cluster.
envs.append({
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
index b64a7f78ff10d03987ea4a8c13a0e34bb433f64c..2d09d940a5ee638e4b55405d05924e2d76006cfc 100644
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
+ },
+ {
+ "name": "PADDLE_CURRENT_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
}
]