提交 7aed1c13 编写于 作者: T typhoonzero

Merge branch 'dist_train_benchmark_vgg16' of...

Merge branch 'dist_train_benchmark_vgg16' of https://github.com/typhoonzero/Paddle into dist_train_benchmark_vgg16
......@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Fluid distributed training perf test
# Performance for distributed vgg16
## Steps to get started
## Test Result
### Single node single thread
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | 16.74 | - |
| PaddlePaddle v2 | - | - | 17.60 | - |
| TensorFlow | - | - | - | - |
### different batch size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | 247.40 | - | - |
| PaddlePaddle v2 | - | - | 256.14 | - |
| TensorFlow | - | - | - | - |
### different pserver number
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
### Accelerate rate
| Trainer Counter | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
## Steps to run the performance test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
......
......@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
......@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
......@@ -53,7 +53,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
......
......@@ -15,7 +15,7 @@ spec:
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
......@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
......@@ -50,7 +50,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
......
FROM paddlepaddle/paddlecloud-job
RUN mkdir -p /workspace
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
ADD vgg16_fluid.py /workspace
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import random
from paddle.v2.image import load_and_transform
import paddle.v2 as paddle
from multiprocessing import cpu_count
def train_mapper(sample):
'''
map image path to type needed by model input layer for the training set
'''
img, label = sample
img = paddle.image.load_image(img)
img = paddle.image.simple_transform(img, 256, 224, True)
return img.flatten().astype('float32'), label
def test_mapper(sample):
'''
map image path to type needed by model input layer for the test set
'''
img, label = sample
img = paddle.image.load_image(img)
img = paddle.image.simple_transform(img, 256, 224, True)
return img.flatten().astype('float32'), label
def train_reader(train_list, buffered_size=1024):
def reader():
with open(train_list, 'r') as f:
lines = [line.strip() for line in f]
for line in lines:
img_path, lab = line.strip().split('\t')
yield img_path, int(lab)
return paddle.reader.xmap_readers(train_mapper, reader,
cpu_count(), buffered_size)
def test_reader(test_list, buffered_size=1024):
def reader():
with open(test_list, 'r') as f:
lines = [line.strip() for line in f]
for line in lines:
img_path, lab = line.strip().split('\t')
yield img_path, int(lab)
return paddle.reader.xmap_readers(test_mapper, reader,
cpu_count(), buffered_size)
if __name__ == '__main__':
#for im in train_reader('train.list'):
# print len(im[0])
#for im in train_reader('test.list'):
# print len(im[0])
paddle.dataset.cifar.train10()
......@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
......@@ -49,7 +49,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
......
......@@ -15,12 +15,14 @@ spec:
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "128"
- name: TRAINERS
value: "20"
- name: PSERVERS
......@@ -28,7 +30,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
......@@ -36,7 +38,7 @@ spec:
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
value: "2"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
......@@ -44,11 +46,11 @@ spec:
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
......
......@@ -16,12 +16,17 @@ import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import reader
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = 128
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
......@@ -77,14 +82,15 @@ def vgg19(input, class_dim):
def main():
global ts
paddle.init(use_gpu=False, trainer_count=1)
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
learning_rate = 0.01
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
......@@ -123,7 +129,9 @@ def main():
# End batch and end pass event handler
def event_handler(event):
global ts
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
......@@ -132,9 +140,8 @@ def main():
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册