diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index c34f7e8fcff070205ba54f8ae6fa5aa27c4dca91..54d1b09a0f210f5b6a6c112fe1224e1c03bf6929 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,13 +1,16 @@
 FROM python:2.7.14
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev && \
-chmod +x /usr/bin/paddle_k8s
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
 #       so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN pip install paddlepaddle
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
 
+# The lines below may change frequently while debugging.
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 69a242e3053df1904f4278b3c5960c1e872ecb2e..6d309217f86dadf9819632ffadcd621864ef8cef 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -40,13 +40,13 @@
 - Batch Size: 128
 - Metrics: samples / sec
 
-| Trainer Counter | 20 | 40 | 80 | 100 |
+| Trainer Count | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
 | PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
 | TensorFlow | - | - | - | - |
 
-### Different Pserver Number
+### Different Pserver Count
 
 - Trainer Count: 100
 - Batch Size: 128
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 87a151db21a85d29433e8b8b7a7bf236572fe1ad..e89b96e4a660a18343ca9928b084377f42735a17 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -50,6 +50,11 @@ parser.add_argument(
     default='CPU',
     choices=['CPU', 'GPU'],
     help="The device type.")
+parser.add_argument(
+    '--device_id',
+    type=int,
+    default=0,
+    help="The device id.")
 parser.add_argument(
     '--data_format',
     type=str,
@@ -135,7 +140,7 @@ def main():
     optimize_ops, params_grads = optimizer.minimize(avg_cost)
 
     # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(args.device_id)
     exe = fluid.Executor(place)
 
     # test