Commit bd64719a authored by typhoonzero

update for today

Parent 7aed1c13
@@ -2,41 +2,57 @@
## Test Result

### Hardware Information

- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz: 2101.000
- cache size: 20480 KB

### Single Node Single Thread

- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |

### Different Batch Size

- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |

### Accelerate Rate

- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec

| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
| TensorFlow | - | - | - | - |

### Different PServer Count

- Trainer Count: 100
- Batch Size: 128
- Metrics: mini-batch / sec

| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
...
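A quick way to read the Accelerate Rate table above is to turn the throughput numbers into speedup and scaling efficiency relative to the 20-trainer baseline. A minimal Python sketch; the dictionary values are copied from the PaddlePaddle Fluid row, and the helper name is ours:

```python
# Scaling efficiency from the "Accelerate Rate" table (PaddlePaddle Fluid row).
# Throughput is samples/sec; the 20-trainer run is the baseline.
fluid_samples_per_sec = {20: 291.06, 40: 518.80, 80: 836.26, 100: 1019.29}


def scaling_efficiency(throughput, trainers, base_trainers=20):
    """Return speedup over the baseline and its ratio to ideal linear speedup."""
    base = fluid_samples_per_sec[base_trainers]
    speedup = throughput / base
    ideal = trainers / float(base_trainers)
    return speedup, speedup / ideal


for n, tput in sorted(fluid_samples_per_sec.items()):
    speedup, eff = scaling_efficiency(tput, n)
    print("trainers=%3d  speedup=%.2fx  efficiency=%.0f%%" % (n, speedup, eff * 100))
```

By this reading, going from 20 to 100 trainers gives roughly a 3.5x speedup (1019.29 / 291.06), about 70% of the ideal 5x linear scaling.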
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
...
@@ -22,7 +22,7 @@ spec:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
         - name: BATCH_SIZE
-          value: "128"
+          value: "256"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
...
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
 import argparse
 import functools
 import os
@@ -160,24 +161,25 @@ def main():
         start_time = time.time()
         num_samples = 0
         accuracy.reset(exe)
-        for batch_id, data in enumerate(train_reader()):
-            ts = time.time()
-            img_data = np.array(
-                map(lambda x: x[0].reshape(data_shape), data)).astype(
-                    "float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            loss, acc = exe.run(trainer_prog,
-                                feed={"pixel": img_data,
-                                      "label": y_data},
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            iters += 1
-            num_samples += len(data)
-            print(
-                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                % (pass_id, iters, loss, acc, time.time() - ts)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
+        with profiler.profiler("CPU", 'total') as prof:
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
+
+                loss, acc = exe.run(trainer_prog,
+                                    feed={"pixel": img_data,
+                                          "label": y_data},
+                                    fetch_list=[avg_cost] + accuracy.metrics)
+                iters += 1
+                num_samples += len(data)
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                    % (pass_id, iters, loss, acc, time.time() - ts)
+                )  # The accuracy is the accumulation of batches, but not the current batch.

         pass_elapsed = time.time() - start_time
         pass_train_acc = accuracy.eval(exe)
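The change above wraps the whole training loop in Fluid's CPU profiler, so a profiling report is produced when the `with` block exits. A stripped-down sketch of the same pattern, using `profiler.profiler("CPU", 'total')` exactly as it appears in the diff; the `run_one_batch` callback and the throughput print are illustrative stand-ins for the real `exe.run(...)` loop body, not the script's code:

```python
import time

import paddle.v2.fluid.profiler as profiler  # legacy Fluid profiler module, as imported above


def profile_training(train_reader, run_one_batch):
    """Time one training pass under the Fluid CPU profiler.

    "CPU" selects the CPU profiling state and 'total' sorts the report by
    total time, matching the arguments used in this commit.
    """
    start = time.time()
    num_samples = 0
    with profiler.profiler("CPU", 'total') as prof:  # report emitted on exit
        for batch_id, data in enumerate(train_reader()):
            run_one_batch(data)          # stand-in for exe.run(trainer_prog, ...)
            num_samples += len(data)
    elapsed = time.time() - start
    print("throughput: %.2f samples/sec" % (num_samples / elapsed))
```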
@@ -211,6 +213,7 @@ def main():
         pserver_endpoints = ",".join(eplist)
         print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
         current_endpoint = os.getenv(
             "POD_IP") + ":6174"  # current pserver endpoint
         training_role = os.getenv(
...
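For context on the added log line: around this hunk the script derives the distributed setup from environment variables injected by the Kubernetes job (a pserver host list, `TRAINERS`, `POD_IP`), builds `pserver_endpoints` with the fixed port 6174, and branches on a training-role variable. A simplified, illustrative sketch under those assumptions; the `PSERVERS` and `TRAINING_ROLE` names and the default values are guesses, since the corresponding `os.getenv` calls are truncated in this diff:

```python
import os

# Illustrative sketch of deriving the distributed role from env vars.
# TRAINERS, POD_IP and the ":6174" pserver port come from this file and the
# job specs above; PSERVERS and TRAINING_ROLE are assumed variable names.
pserver_hosts = [h for h in os.getenv("PSERVERS", "").split(",") if h]
eplist = [host + ":6174" for host in pserver_hosts]
pserver_endpoints = ",".join(eplist)        # e.g. "192.168.0.1:6174,192.168.0.2:6174"

trainers = int(os.getenv("TRAINERS", "1"))  # total trainer count
current_endpoint = os.getenv("POD_IP", "127.0.0.1") + ":6174"
training_role = os.getenv("TRAINING_ROLE", "TRAINER")

if training_role == "PSERVER":
    print("current pserver endpoint: %s of [%s]" % (current_endpoint, pserver_endpoints))
else:
    print("trainer starting, %d trainers total" % trainers)
```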
@@ -26,6 +26,7 @@ if BATCH_SIZE:
     BATCH_SIZE = int(BATCH_SIZE)
 else:
     BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
 NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0
...