update for today

bd64719a · typhoonzero · 7aed1c13 · bd64719a · bd64719a · bd64719a
5 changed files
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -2,41 +2,57 @@
 ## Test Result
-### Single node single thread
+### Hardware Information
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- CPU MHz: 2101.000
+- Cache size: 20480 KB
+### Single Node Single Thread
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
 | Batch Size | 32 | 64 | 128 | 256 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | 16.74 | - |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | - | - | 17.60 | - |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
 | TensorFlow | - | - | - | - |
 ### different batch size
 - PServer Count: 10
 - Trainer Count: 20
+- Per trainer CPU Core: 1
 - Metrics: samples / sec
 | Batch Size | 32 | 64 | 128 | 256 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | 247.40 | - | - |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | - | - | 256.14 | - |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
 | TensorFlow | - | - | - | - |
-### different pserver number
- Trainer Count: 100
+### Accelerate rate
- Batch Size: 64
- Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
+- PServer Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+| Trainer Count | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | - | - | - | - |
+| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
 | TensorFlow | - | - | - | - |
-### Accelerate rate
+### different pserver number
-| Trainer Counter | 20 | 40 | 80 | 100 |
+- Trainer Count: 100
+- Batch Size: 128
+- Metrics: mini-batch / sec
+| PServer Count | 10 | 20 | 40 | 60 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | - | - | - | - |
 | PaddlePaddle v2 | - | - | - | - |

--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -30,7 +30,7 @@ spec:
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT

--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -22,7 +22,7 @@ spec:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: BATCH_SIZE
-          value: "128"
+          value: "256"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS

--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
 import argparse
 import functools
 import os
@@ -160,6 +161,7 @@ def main():
            start_time = time.time()
            num_samples = 0
            accuracy.reset(exe)
+            with profiler.profiler("CPU", 'total') as prof:
                for batch_id, data in enumerate(train_reader()):
                    ts = time.time()
                    img_data = np.array(
@@ -211,6 +213,7 @@ def main():
        pserver_endpoints = ",".join(eplist)
        print("pserver endpoints: ", pserver_endpoints)
        trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
        current_endpoint = os.getenv(
            "POD_IP") + ":6174"  # current pserver endpoint
        training_role = os.getenv(

--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -26,6 +26,7 @@ if BATCH_SIZE:
    BATCH_SIZE = int(BATCH_SIZE)
 else:
    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
 NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0