Commit bd64719a authored by typhoonzero

update for today

Parent 7aed1c13
@@ -2,41 +2,57 @@
## Test Result

### Hardware Information

- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz: 2101.000
- cache size: 20480 KB

### Single Node Single Thread

- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |

### Different Batch Size

- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |

### Accelerate Rate

- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec

| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
| TensorFlow | - | - | - | - |

### Different PServer Count

- Trainer Count: 100
- Batch Size: 128
- Metrics: mini-batch / sec

| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
...
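A quick way to read the Accelerate Rate table above is to turn the throughput numbers into speedup and scaling efficiency relative to the 20-trainer baseline. A minimal Python sketch; the dictionary values are copied from the PaddlePaddle Fluid row, and the helper name is ours:

```python
# Scaling efficiency from the "Accelerate Rate" table (PaddlePaddle Fluid row).
# Throughput is samples/sec; the 20-trainer run is the baseline.
fluid_samples_per_sec = {20: 291.06, 40: 518.80, 80: 836.26, 100: 1019.29}


def scaling_efficiency(throughput, trainers, base_trainers=20):
    """Return speedup over the baseline and its ratio to ideal linear speedup."""
    base = fluid_samples_per_sec[base_trainers]
    speedup = throughput / base
    ideal = trainers / float(base_trainers)
    return speedup, speedup / ideal


for n, tput in sorted(fluid_samples_per_sec.items()):
    speedup, eff = scaling_efficiency(tput, n)
    print("trainers=%3d  speedup=%.2fx  efficiency=%.0f%%" % (n, speedup, eff * 100))
```

By this reading, going from 20 to 100 trainers gives roughly a 3.5x speedup (1019.29 / 291.06), about 70% of the ideal 5x linear scaling.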
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
...
@@ -22,7 +22,7 @@ spec:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
         - name: BATCH_SIZE
-          value: "128"
+          value: "256"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
...
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
 import argparse
 import functools
 import os
@@ -160,24 +161,25 @@ def main():
         start_time = time.time()
         num_samples = 0
         accuracy.reset(exe)
-        for batch_id, data in enumerate(train_reader()):
-            ts = time.time()
-            img_data = np.array(
-                map(lambda x: x[0].reshape(data_shape), data)).astype(
-                    "float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            loss, acc = exe.run(trainer_prog,
-                                feed={"pixel": img_data,
-                                      "label": y_data},
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            iters += 1
-            num_samples += len(data)
-            print(
-                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                % (pass_id, iters, loss, acc, time.time() - ts)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
+        with profiler.profiler("CPU", 'total') as prof:
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
+
+                loss, acc = exe.run(trainer_prog,
+                                    feed={"pixel": img_data,
+                                          "label": y_data},
+                                    fetch_list=[avg_cost] + accuracy.metrics)
+                iters += 1
+                num_samples += len(data)
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                    % (pass_id, iters, loss, acc, time.time() - ts)
+                )  # The accuracy is the accumulation of batches, but not the current batch.

         pass_elapsed = time.time() - start_time
         pass_train_acc = accuracy.eval(exe)
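The change above wraps the whole training loop in Fluid's CPU profiler, so a profiling report is produced when the `with` block exits. A stripped-down sketch of the same pattern, using `profiler.profiler("CPU", 'total')` exactly as it appears in the diff; the `run_one_batch` callback and the throughput print are illustrative stand-ins for the real `exe.run(...)` loop body, not the script's code:

```python
import time

import paddle.v2.fluid.profiler as profiler  # legacy Fluid profiler module, as imported above


def profile_training(train_reader, run_one_batch):
    """Time one training pass under the Fluid CPU profiler.

    "CPU" selects the CPU profiling state and 'total' sorts the report by
    total time, matching the arguments used in this commit.
    """
    start = time.time()
    num_samples = 0
    with profiler.profiler("CPU", 'total') as prof:  # report emitted on exit
        for batch_id, data in enumerate(train_reader()):
            run_one_batch(data)          # stand-in for exe.run(trainer_prog, ...)
            num_samples += len(data)
    elapsed = time.time() - start
    print("throughput: %.2f samples/sec" % (num_samples / elapsed))
```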
@@ -211,6 +213,7 @@ def main():
         pserver_endpoints = ",".join(eplist)
         print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
         current_endpoint = os.getenv(
             "POD_IP") + ":6174"  # current pserver endpoint
         training_role = os.getenv(
...
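For context on the added log line: around this hunk the script derives the distributed setup from environment variables injected by the Kubernetes job (a pserver host list, `TRAINERS`, `POD_IP`), builds `pserver_endpoints` with the fixed port 6174, and branches on a training-role variable. A simplified, illustrative sketch under those assumptions; the `PSERVERS` and `TRAINING_ROLE` names and the default values are guesses, since the corresponding `os.getenv` calls are truncated in this diff:

```python
import os

# Illustrative sketch of deriving the distributed role from env vars.
# TRAINERS, POD_IP and the ":6174" pserver port come from this file and the
# job specs above; PSERVERS and TRAINING_ROLE are assumed variable names.
pserver_hosts = [h for h in os.getenv("PSERVERS", "").split(",") if h]
eplist = [host + ":6174" for host in pserver_hosts]
pserver_endpoints = ",".join(eplist)        # e.g. "192.168.0.1:6174,192.168.0.2:6174"

trainers = int(os.getenv("TRAINERS", "1"))  # total trainer count
current_endpoint = os.getenv("POD_IP", "127.0.0.1") + ":6174"
training_role = os.getenv("TRAINING_ROLE", "TRAINER")

if training_role == "PSERVER":
    print("current pserver endpoint: %s of [%s]" % (current_endpoint, pserver_endpoints))
else:
    print("trainer starting, %d trainers total" % trainers)
```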
@@ -26,6 +26,7 @@ if BATCH_SIZE:
     BATCH_SIZE = int(BATCH_SIZE)
 else:
     BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
 NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0
...