diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
similarity index 91%
rename from benchmark/cluster/vgg16/fluid/Dockerfile
rename to benchmark/cluster/vgg16/Dockerfile
index 711076b09e316292007acc40bedc1987d06c0065..dfaffb8c213f9ab6dac1f7e0f8fd6f7ebc360739 100644
--- a/benchmark/cluster/vgg16/fluid/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib
 ADD reader.py /workspace/
 RUN python /workspace/reader.py
 
-ADD vgg16.py /workspace/
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..18128e52761715e4380d72c89bf53f7346f930ec
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,58 @@
+# Performance for distributed vgg16
+
+## Test Result
+
+### Single node single thread
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | 16.74 | - |
+| PaddlePaddle v2 | - | - | 17.60 | - |
+| TensorFlow | - | - | - | - |
+
+### Different batch size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | 247.40 | - | - |
+| PaddlePaddle v2 | - | - | 256.14 | - |
+| TensorFlow | - | - | - | - |
+
+### Different pserver number
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Acceleration rate
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+
+## Steps to run the performance test
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f fluid_pserver.yaml && kubectl create -f fluid_trainer.yaml` (or the `v2_pserver.yaml` / `v2_trainer.yaml` pair for the v2 benchmark) to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable verbose logs
+
+Edit the pserver and trainer YAML files (for example `fluid_pserver.yaml` and `fluid_trainer.yaml`) and add an environment variable `GLOG_v=3` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md
deleted file mode 100644
index 71a3a934d20b0328ec41dbc34ca3b384749ca49a..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Fluid distributed training perf test
-
-## Steps to get started
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable verbos logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
similarity index 89%
rename from benchmark/cluster/vgg16/fluid/pserver.yaml
rename to benchmark/cluster/vgg16/fluid_pserver.yaml
index e1a58260af0325a313934cfa3730801190cadcce..ee8b0763b62fc011f40f6197e929a68b48a93e47 100644
--- a/benchmark/cluster/vgg16/fluid/pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -14,7 +14,7 @@ spec:
       - name: job-registry-secret
       containers:
       - name: pserver
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         ports:
         - name: jobport-30236
@@ -33,7 +33,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -53,7 +53,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
similarity index 87%
rename from benchmark/cluster/vgg16/fluid/trainer.yaml
rename to benchmark/cluster/vgg16/fluid_trainer.yaml
index c8e26d4b511f4f659fc08229cb463bd77a6f724b..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802 100644
--- a/benchmark/cluster/vgg16/fluid/trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -15,7 +15,7 @@ spec:
       hostNetwork: true
       containers:
       - name: trainer
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         command: ["paddle_k8s", "start_fluid"]
         env:
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -50,7 +50,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/k8s_tools.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/k8s_tools.py
rename to benchmark/cluster/vgg16/k8s_tools.py
diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/paddle_k8s
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/paddle_k8s
rename to benchmark/cluster/vgg16/paddle_k8s
diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/reader.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/reader.py
rename to benchmark/cluster/vgg16/reader.py
diff --git a/benchmark/cluster/vgg16/v2/Dockerfile b/benchmark/cluster/vgg16/v2/Dockerfile
deleted file mode 100644
index 5f129a8e323a72bd1f9e1ca9a2046ee2149f3a2c..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM paddlepaddle/paddlecloud-job
-RUN mkdir -p /workspace
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
-ADD vgg16.py /workspace/
-
-ADD vgg16_fluid.py /workspace
diff --git a/benchmark/cluster/vgg16/v2/reader.py b/benchmark/cluster/vgg16/v2/reader.py
deleted file mode 100644
index 16ac2dbcef4b758a2bf7a057a4a99e4ce7e136cb..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2/reader.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import random
-from paddle.v2.image import load_and_transform
-import paddle.v2 as paddle
-from multiprocessing import cpu_count
-
-
-def train_mapper(sample):
-    '''
-    map image path to type needed by model input layer for the training set
-    '''
-    img, label = sample
-    img = paddle.image.load_image(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
-    return img.flatten().astype('float32'), label
-
-
-def test_mapper(sample):
-    '''
-    map image path to type needed by model input layer for the test set
-    '''
-    img, label = sample
-    img = paddle.image.load_image(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
-    return img.flatten().astype('float32'), label
-
-
-def train_reader(train_list, buffered_size=1024):
-    def reader():
-        with open(train_list, 'r') as f:
-            lines = [line.strip() for line in f]
-            for line in lines:
-                img_path, lab = line.strip().split('\t')
-                yield img_path, int(lab)
-
-    return paddle.reader.xmap_readers(train_mapper, reader,
-                                      cpu_count(), buffered_size)
-
-
-def test_reader(test_list, buffered_size=1024):
-    def reader():
-        with open(test_list, 'r') as f:
-            lines = [line.strip() for line in f]
-            for line in lines:
-                img_path, lab = line.strip().split('\t')
-                yield img_path, int(lab)
-
-    return paddle.reader.xmap_readers(test_mapper, reader,
-                                      cpu_count(), buffered_size)
-
-
-if __name__ == '__main__':
-    #for im in train_reader('train.list'):
-    #    print len(im[0])
-    #for im in train_reader('test.list'):
-    #    print len(im[0])
-    paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/v2/pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
similarity index 92%
rename from benchmark/cluster/vgg16/v2/pserver.yaml
rename to benchmark/cluster/vgg16/v2_pserver.yaml
index 943675e147212ebf9b2007b9f914bdc8d6d2ba4e..dd1271e0cf399184134c06b3200ee1202c65cef0 100644
--- a/benchmark/cluster/vgg16/v2/pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -14,7 +14,7 @@ spec:
       - name: job-registry-secret
       containers:
       - name: pserver
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         ports:
         - name: jobport-30236
@@ -49,7 +49,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/v2/trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
similarity index 88%
rename from benchmark/cluster/vgg16/v2/trainer.yaml
rename to benchmark/cluster/vgg16/v2_trainer.yaml
index 3288fbae264ebf128be531e7c145b9e9d2851ab9..9d52e231f0e7e1804e515fb7f0de60e75635ae8b 100644
--- a/benchmark/cluster/vgg16/v2/trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -15,12 +15,14 @@ spec:
       hostNetwork: true
       containers:
       - name: trainer
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         command: ["paddle_k8s", "start_trainer", "v2"]
         env:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "128"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
@@ -28,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -48,7 +50,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/vgg16_fluid.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/vgg16.py
rename to benchmark/cluster/vgg16/vgg16_fluid.py
diff --git a/benchmark/cluster/vgg16/v2/vgg16.py b/benchmark/cluster/vgg16/vgg16_v2.py
similarity index 90%
rename from benchmark/cluster/vgg16/v2/vgg16.py
rename to benchmark/cluster/vgg16/vgg16_v2.py
index 0ffa9703b798c681ba2580a1a634aef0e0f06c9d..284dbec48dcb794f947a4a9c4af7949697cac8e9 100644
--- a/benchmark/cluster/vgg16/v2/vgg16.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -16,12 +16,17 @@ import gzip
 
 import paddle.v2.dataset.cifar as cifar
 import paddle.v2 as paddle
-import reader
 import time
+import os
 
 DATA_DIM = 3 * 32 * 32
 CLASS_DIM = 10
-BATCH_SIZE = 128
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0
 
 
@@ -84,7 +89,8 @@ def main():
         name="label", type=paddle.data_type.integer_value(CLASS_DIM))
 
     extra_layers = None
-    learning_rate = 1e-3 / 20
+    # NOTE: v2 distributed training needs averaged updates, so divide the learning rate by the trainer count.
+    learning_rate = 1e-3 / NODE_COUNT
     out = vgg16(image, class_dim=CLASS_DIM)
     cost = paddle.layer.classification_cost(input=out, label=lbl)
 
@@ -123,7 +129,9 @@ def main():
 
     # End batch and end pass event handler
     def event_handler(event):
-        global ts
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
         if isinstance(event, paddle.event.BeginIteration):
             ts = time.time()
         if isinstance(event, paddle.event.EndIteration):
@@ -132,9 +140,8 @@ def main():
                     event.pass_id, event.batch_id, event.cost, event.metrics,
                     time.time() - ts)
         if isinstance(event, paddle.event.EndPass):
-            with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-                trainer.save_parameter_to_tar(f)
-
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
             result = trainer.test(reader=test_reader)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)