提交 7aed1c13 编写于 作者: T typhoonzero

Merge branch 'dist_train_benchmark_vgg16' of...

Merge branch 'dist_train_benchmark_vgg16' of https://github.com/typhoonzero/Paddle into dist_train_benchmark_vgg16
...@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib ...@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/ ADD reader.py /workspace/
RUN python /workspace/reader.py RUN python /workspace/reader.py
ADD vgg16.py /workspace/ ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Fluid distributed training perf test # Performance for distributed vgg16
## Steps to get started ## Test Result
### Single node single thread
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | 16.74 | - |
| PaddlePaddle v2 | - | - | 17.60 | - |
| TensorFlow | - | - | - | - |
### different batch size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | 247.40 | - | - |
| PaddlePaddle v2 | - | - | 256.14 | - |
| TensorFlow | - | - | - | - |
### different pserver number
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
### Acceleration rate
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
## Steps to run the performance test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. 1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. 1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
......
...@@ -14,7 +14,7 @@ spec: ...@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret - name: job-registry-secret
containers: containers:
- name: pserver - name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- name: jobport-30236 - name: jobport-30236
...@@ -33,7 +33,7 @@ spec: ...@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
...@@ -53,7 +53,7 @@ spec: ...@@ -53,7 +53,7 @@ spec:
- name: PADDLE_INIT_USE_GPU - name: PADDLE_INIT_USE_GPU
value: "0" value: "0"
- name: LD_LIBRARY_PATH - name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64" value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE - name: NAMESPACE
valueFrom: valueFrom:
fieldRef: fieldRef:
......
...@@ -15,7 +15,7 @@ spec: ...@@ -15,7 +15,7 @@ spec:
hostNetwork: true hostNetwork: true
containers: containers:
- name: trainer - name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"] command: ["paddle_k8s", "start_fluid"]
env: env:
...@@ -30,7 +30,7 @@ spec: ...@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
...@@ -50,7 +50,7 @@ spec: ...@@ -50,7 +50,7 @@ spec:
- name: PADDLE_INIT_USE_GPU - name: PADDLE_INIT_USE_GPU
value: "0" value: "0"
- name: LD_LIBRARY_PATH - name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64" value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE - name: NAMESPACE
valueFrom: valueFrom:
fieldRef: fieldRef:
......
# Benchmark image: paddlecloud-job base plus the vgg16 benchmark scripts.
FROM paddlepaddle/paddlecloud-job
# /workspace is the directory the k8s job specs point TRAINER_PACKAGE at.
RUN mkdir -p /workspace
ADD reader.py /workspace/
# Running reader.py at build time triggers paddle.dataset.cifar.train10(),
# presumably to cache the CIFAR-10 dataset inside the image — TODO confirm.
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
ADD vgg16_fluid.py /workspace
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import random
from paddle.v2.image import load_and_transform
import paddle.v2 as paddle
from multiprocessing import cpu_count
def train_mapper(sample):
    """Convert one (image path, label) pair into model input for training.

    Loads the image from disk, applies paddle's simple transform
    (resize to 256, crop to 224, training mode), and returns a
    flattened float32 vector together with the integer label.
    """
    path, label = sample
    image = paddle.image.load_image(path)
    image = paddle.image.simple_transform(image, 256, 224, True)
    return image.flatten().astype('float32'), label
def test_mapper(sample):
    """Convert one (image path, label) pair into model input for the test set.

    Loads the image from disk, applies paddle's simple transform
    (resize to 256, crop to 224) in evaluation mode, and returns a
    flattened float32 vector together with the integer label.
    """
    img, label = sample
    img = paddle.image.load_image(img)
    # BUG FIX: the original passed is_train=True, applying training-time
    # (randomized) transforms to *test* images.  Evaluation must use the
    # deterministic test-time path, so pass is_train=False here.
    img = paddle.image.simple_transform(img, 256, 224, False)
    return img.flatten().astype('float32'), label
def train_reader(train_list, buffered_size=1024):
    """Build a buffered, multi-worker reader over the training list file.

    Each line of ``train_list`` is expected to contain an image path and an
    integer label separated by a tab.  Decoding/transforming is fanned out
    across ``cpu_count()`` workers via ``paddle.reader.xmap_readers``.
    """

    def read_list():
        # Stream the list file line by line; strip trailing whitespace
        # before splitting path and label.
        with open(train_list, 'r') as list_file:
            for raw_line in list_file:
                path, label = raw_line.strip().split('\t')
                yield path, int(label)

    return paddle.reader.xmap_readers(train_mapper, read_list,
                                      cpu_count(), buffered_size)
def test_reader(test_list, buffered_size=1024):
    """Build a buffered, multi-worker reader over the test list file.

    Each line of ``test_list`` is expected to contain an image path and an
    integer label separated by a tab.  Decoding/transforming is fanned out
    across ``cpu_count()`` workers via ``paddle.reader.xmap_readers``.
    """

    def read_list():
        # Stream the list file line by line; strip trailing whitespace
        # before splitting path and label.
        with open(test_list, 'r') as list_file:
            for raw_line in list_file:
                path, label = raw_line.strip().split('\t')
                yield path, int(label)

    return paddle.reader.xmap_readers(test_mapper, read_list,
                                      cpu_count(), buffered_size)
if __name__ == '__main__':
    # Fetch the CIFAR-10 training set — presumably run at Docker build time
    # so the dataset is cached inside the image (see the image's
    # `RUN python /workspace/reader.py` step); confirm against the Dockerfile.
    paddle.dataset.cifar.train10()
...@@ -14,7 +14,7 @@ spec: ...@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret - name: job-registry-secret
containers: containers:
- name: pserver - name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- name: jobport-30236 - name: jobport-30236
...@@ -49,7 +49,7 @@ spec: ...@@ -49,7 +49,7 @@ spec:
- name: PADDLE_INIT_USE_GPU - name: PADDLE_INIT_USE_GPU
value: "0" value: "0"
- name: LD_LIBRARY_PATH - name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64" value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE - name: NAMESPACE
valueFrom: valueFrom:
fieldRef: fieldRef:
......
...@@ -15,12 +15,14 @@ spec: ...@@ -15,12 +15,14 @@ spec:
hostNetwork: true hostNetwork: true
containers: containers:
- name: trainer - name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"] command: ["paddle_k8s", "start_trainer", "v2"]
env: env:
- name: PADDLE_JOB_NAME - name: PADDLE_JOB_NAME
value: vgg16v2job value: vgg16v2job
- name: BATCH_SIZE
value: "128"
- name: TRAINERS - name: TRAINERS
value: "20" value: "20"
- name: PSERVERS - name: PSERVERS
...@@ -28,7 +30,7 @@ spec: ...@@ -28,7 +30,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py" value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
...@@ -36,7 +38,7 @@ spec: ...@@ -36,7 +38,7 @@ spec:
- name: PADDLE_INIT_NICS - name: PADDLE_INIT_NICS
value: "xgbe0" value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT - name: PADDLE_INIT_TRAINER_COUNT
value: "1" value: "2"
- name: PADDLE_INIT_PORTS_NUM - name: PADDLE_INIT_PORTS_NUM
value: "1" value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
...@@ -44,11 +46,11 @@ spec: ...@@ -44,11 +46,11 @@ spec:
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20" value: "20"
- name: PADDLE_INIT_NUM_PASSES - name: PADDLE_INIT_NUM_PASSES
value: "1" value: "2"
- name: PADDLE_INIT_USE_GPU - name: PADDLE_INIT_USE_GPU
value: "0" value: "0"
- name: LD_LIBRARY_PATH - name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64" value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE - name: NAMESPACE
valueFrom: valueFrom:
fieldRef: fieldRef:
......
...@@ -16,12 +16,17 @@ import gzip ...@@ -16,12 +16,17 @@ import gzip
import paddle.v2.dataset.cifar as cifar import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle import paddle.v2 as paddle
import reader
import time import time
import os
DATA_DIM = 3 * 32 * 32 DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10 CLASS_DIM = 10
BATCH_SIZE = 128 BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0 ts = 0
...@@ -77,14 +82,15 @@ def vgg19(input, class_dim): ...@@ -77,14 +82,15 @@ def vgg19(input, class_dim):
def main(): def main():
global ts global ts
paddle.init(use_gpu=False, trainer_count=1) paddle.init(use_gpu=False)
image = paddle.layer.data( image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM)) name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data( lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM)) name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None extra_layers = None
learning_rate = 0.01 # NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM) out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl) cost = paddle.layer.classification_cost(input=out, label=lbl)
...@@ -123,7 +129,9 @@ def main(): ...@@ -123,7 +129,9 @@ def main():
# End batch and end pass event handler # End batch and end pass event handler
def event_handler(event): def event_handler(event):
global ts global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration): if isinstance(event, paddle.event.BeginIteration):
ts = time.time() ts = time.time()
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
...@@ -132,9 +140,8 @@ def main(): ...@@ -132,9 +140,8 @@ def main():
event.pass_id, event.batch_id, event.cost, event.metrics, event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts) time.time() - ts)
if isinstance(event, paddle.event.EndPass): if isinstance(event, paddle.event.EndPass):
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: print "Pass %d end, spent: %f" % (event.pass_id,
trainer.save_parameter_to_tar(f) time.time() - ts_pass)
result = trainer.test(reader=test_reader) result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册