From bd64719a2f012af82dcac731179a998764d432b9 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 29 Jan 2018 20:42:29 +0800 Subject: [PATCH] update for today --- benchmark/cluster/vgg16/README.md | 44 +++++++++++++++------- benchmark/cluster/vgg16/fluid_trainer.yaml | 2 +- benchmark/cluster/vgg16/v2_trainer.yaml | 2 +- benchmark/cluster/vgg16/vgg16_fluid.py | 39 ++++++++++--------- benchmark/cluster/vgg16/vgg16_v2.py | 1 + 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index 18128e52761..c1e85a2c407 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -2,41 +2,57 @@ ## Test Result -### Single node single thread +### Hardware Information + +- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- cpu MHz : 2101.000 +- cache size : 20480 KB + +### Single Node Single Thread + +- PServer Count: 10 +- Trainer Count: 20 +- Metrics: samples / sec | Batch Size | 32 | 64 | 128 | 256 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | 16.74 | - | -| PaddlePaddle v2 | - | - | 17.60 | - | +| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 | +| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 | | TensorFlow | - | - | - | - | ### different batch size - PServer Count: 10 - Trainer Count: 20 +- Per trainer CPU Core: 1 - Metrics: samples / sec | Batch Size | 32 | 64 | 128 | 256 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | 247.40 | - | - | -| PaddlePaddle v2 | - | - | 256.14 | - | +| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 | +| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 | | TensorFlow | - | - | - | - | -### different pserver number -- Trainer Count: 100 -- Batch Size: 64 -- Metrics: mini-batch / sec +### Accelerate rate -| PServer Count | 10 | 20 | 40 | 60 | +- PServer Count: 20 +- Batch Size: 128 +- Metrics: samples / sec + +| Trainer Count | 20 | 40 | 80 | 100 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - 
| - | - | - | -| PaddlePaddle v2 | - | - | - | - | +| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 | +| PaddlePaddle v2 | 356.28 | - | - | 1041.99 | | TensorFlow | - | - | - | - | -### Accelerate rate +### different pserver number -| Trainer Counter | 20 | 40 | 80 | 100 | +- Trainer Count: 100 +- Batch Size: 128 +- Metrics: mini-batch / sec + +| PServer Count | 10 | 20 | 40 | 60 | | -- | -- | -- | -- | -- | | PaddlePaddle Fluid | - | - | - | - | | PaddlePaddle v2 | - | - | - | - | diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml index 0a0ed25ebe4..2f6a87ab02a 100644 --- a/benchmark/cluster/vgg16/fluid_trainer.yaml +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -30,7 +30,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml index 9d52e231f0e..997bbc81c99 100644 --- a/benchmark/cluster/vgg16/v2_trainer.yaml +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -22,7 +22,7 @@ spec: - name: PADDLE_JOB_NAME value: vgg16v2job - name: BATCH_SIZE - value: "128" + value: "256" - name: TRAINERS value: "20" - name: PSERVERS diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 88d6d79cc06..51a01af6722 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -20,6 +20,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core +import paddle.v2.fluid.profiler as profiler import argparse import functools import os @@ -160,24 +161,25 @@ def main(): start_time = time.time() num_samples = 0 accuracy.reset(exe) - for batch_id, data in 
enumerate(train_reader()): - ts = time.time() - img_data = np.array( - map(lambda x: x[0].reshape(data_shape), data)).astype( - "float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - loss, acc = exe.run(trainer_prog, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[avg_cost] + accuracy.metrics) - iters += 1 - num_samples += len(data) - print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" - % (pass_id, iters, loss, acc, time.time() - ts) - ) # The accuracy is the accumulation of batches, but not the current batch. + with profiler.profiler("CPU", 'total') as prof: + for batch_id, data in enumerate(train_reader()): + ts = time.time() + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc = exe.run(trainer_prog, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" + % (pass_id, iters, loss, acc, time.time() - ts) + ) # The accuracy is the accumulation of batches, but not the current batch. 
pass_elapsed = time.time() - start_time pass_train_acc = accuracy.eval(exe) @@ -211,6 +213,7 @@ def main(): pserver_endpoints = ",".join(eplist) print("pserver endpoints: ", pserver_endpoints) trainers = int(os.getenv("TRAINERS")) # total trainer count + print("trainers total: ", trainers) current_endpoint = os.getenv( "POD_IP") + ":6174" # current pserver endpoint training_role = os.getenv( diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py index 284dbec48dc..81ddeb03323 100644 --- a/benchmark/cluster/vgg16/vgg16_v2.py +++ b/benchmark/cluster/vgg16/vgg16_v2.py @@ -26,6 +26,7 @@ if BATCH_SIZE: BATCH_SIZE = int(BATCH_SIZE) else: BATCH_SIZE = 128 +print "batch_size", BATCH_SIZE NODE_COUNT = int(os.getenv("TRAINERS")) ts = 0 -- GitLab