diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/Dockerfile similarity index 91% rename from benchmark/cluster/vgg16/fluid/Dockerfile rename to benchmark/cluster/vgg16/Dockerfile index 711076b09e316292007acc40bedc1987d06c0065..dfaffb8c213f9ab6dac1f7e0f8fd6f7ebc360739 100644 --- a/benchmark/cluster/vgg16/fluid/Dockerfile +++ b/benchmark/cluster/vgg16/Dockerfile @@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib ADD reader.py /workspace/ RUN python /workspace/reader.py -ADD vgg16.py /workspace/ +ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..18128e52761715e4380d72c89bf53f7346f930ec --- /dev/null +++ b/benchmark/cluster/vgg16/README.md @@ -0,0 +1,58 @@ +# Performance for distributed vgg16 + +## Test Result + +### Single node single thread + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | 16.74 | - | +| PaddlePaddle v2 | - | - | 17.60 | - | +| TensorFlow | - | - | - | - | +
+### different batch size +
+- PServer Count: 10 +- Trainer Count: 20 +- Metrics: samples / sec +
+| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | 247.40 | - | - | +| PaddlePaddle v2 | - | - | 256.14 | - | +| TensorFlow | - | - | - | - | +
+### different pserver number +
+- Trainer Count: 100 +- Batch Size: 64 +- Metrics: mini-batch / sec +
+| PServer Count | 10 | 20 | 40 | 60 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | +
+### Acceleration rate +
+| Trainer Count | 20 | 40 | 80 | 100 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | + + +## Steps to run the performance test + +1. 
You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. +1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to repository so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. +
+Check the logs for the distributed training progress and analyze the performance. +
+## Enable verbose logs +
+Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail. diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md deleted file mode 100644 index 71a3a934d20b0328ec41dbc34ca3b384749ca49a..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/fluid/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Fluid distributed training perf test - -## Steps to get started - -1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. -1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. -1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it. -1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). -1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. 
- -Check the logs for the distributed training progress and analyze the performance. - -## Enable verbos logs - -Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail. diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml similarity index 89% rename from benchmark/cluster/vgg16/fluid/pserver.yaml rename to benchmark/cluster/vgg16/fluid_pserver.yaml index e1a58260af0325a313934cfa3730801190cadcce..ee8b0763b62fc011f40f6197e929a68b48a93e47 100644 --- a/benchmark/cluster/vgg16/fluid/pserver.yaml +++ b/benchmark/cluster/vgg16/fluid_pserver.yaml @@ -14,7 +14,7 @@ spec: - name: job-registry-secret containers: - name: pserver - image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" imagePullPolicy: Always ports: - name: jobport-30236 @@ -33,7 +33,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT @@ -53,7 +53,7 @@ spec: - name: PADDLE_INIT_USE_GPU value: "0" - name: LD_LIBRARY_PATH - value: "/usr/local/nvidia/lib64" + value: "/usr/local/lib:/usr/local/nvidia/lib64" - name: NAMESPACE valueFrom: fieldRef: diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml similarity index 87% rename from benchmark/cluster/vgg16/fluid/trainer.yaml rename to benchmark/cluster/vgg16/fluid_trainer.yaml index c8e26d4b511f4f659fc08229cb463bd77a6f724b..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802 100644 --- a/benchmark/cluster/vgg16/fluid/trainer.yaml +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -15,7 +15,7 @@ spec: hostNetwork: true containers: - name: trainer - image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" + image: 
"registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" imagePullPolicy: Always command: ["paddle_k8s", "start_fluid"] env: @@ -30,7 +30,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT @@ -50,7 +50,7 @@ spec: - name: PADDLE_INIT_USE_GPU value: "0" - name: LD_LIBRARY_PATH - value: "/usr/local/nvidia/lib64" + value: "/usr/local/lib:/usr/local/nvidia/lib64" - name: NAMESPACE valueFrom: fieldRef: diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/k8s_tools.py similarity index 100% rename from benchmark/cluster/vgg16/fluid/k8s_tools.py rename to benchmark/cluster/vgg16/k8s_tools.py diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/paddle_k8s similarity index 100% rename from benchmark/cluster/vgg16/fluid/paddle_k8s rename to benchmark/cluster/vgg16/paddle_k8s diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/reader.py similarity index 100% rename from benchmark/cluster/vgg16/fluid/reader.py rename to benchmark/cluster/vgg16/reader.py diff --git a/benchmark/cluster/vgg16/v2/Dockerfile b/benchmark/cluster/vgg16/v2/Dockerfile deleted file mode 100644 index 5f129a8e323a72bd1f9e1ca9a2046ee2149f3a2c..0000000000000000000000000000000000000000 --- a/benchmark/cluster/vgg16/v2/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM paddlepaddle/paddlecloud-job -RUN mkdir -p /workspace -ADD reader.py /workspace/ -RUN python /workspace/reader.py -ADD vgg16.py /workspace/ - -ADD vgg16_fluid.py /workspace diff --git a/benchmark/cluster/vgg16/v2/reader.py b/benchmark/cluster/vgg16/v2/reader.py deleted file mode 100644 index 16ac2dbcef4b758a2bf7a057a4a99e4ce7e136cb..0000000000000000000000000000000000000000 --- 
a/benchmark/cluster/vgg16/v2/reader.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. - -import random -from paddle.v2.image import load_and_transform -import paddle.v2 as paddle -from multiprocessing import cpu_count - - -def train_mapper(sample): - ''' - map image path to type needed by model input layer for the training set - ''' - img, label = sample - img = paddle.image.load_image(img) - img = paddle.image.simple_transform(img, 256, 224, True) - return img.flatten().astype('float32'), label - - -def test_mapper(sample): - ''' - map image path to type needed by model input layer for the test set - ''' - img, label = sample - img = paddle.image.load_image(img) - img = paddle.image.simple_transform(img, 256, 224, True) - return img.flatten().astype('float32'), label - - -def train_reader(train_list, buffered_size=1024): - def reader(): - with open(train_list, 'r') as f: - lines = [line.strip() for line in f] - for line in lines: - img_path, lab = line.strip().split('\t') - yield img_path, int(lab) - - return paddle.reader.xmap_readers(train_mapper, reader, - cpu_count(), buffered_size) - - -def test_reader(test_list, buffered_size=1024): - def reader(): - with open(test_list, 'r') as f: - lines = [line.strip() for line in f] - for line in lines: - img_path, lab = line.strip().split('\t') - yield img_path, int(lab) - - return paddle.reader.xmap_readers(test_mapper, reader, - 
cpu_count(), buffered_size) - - -if __name__ == '__main__': - #for im in train_reader('train.list'): - # print len(im[0]) - #for im in train_reader('test.list'): - # print len(im[0]) - paddle.dataset.cifar.train10() diff --git a/benchmark/cluster/vgg16/v2/pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml similarity index 92% rename from benchmark/cluster/vgg16/v2/pserver.yaml rename to benchmark/cluster/vgg16/v2_pserver.yaml index 943675e147212ebf9b2007b9f914bdc8d6d2ba4e..dd1271e0cf399184134c06b3200ee1202c65cef0 100644 --- a/benchmark/cluster/vgg16/v2/pserver.yaml +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -14,7 +14,7 @@ spec: - name: job-registry-secret containers: - name: pserver - image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" imagePullPolicy: Always ports: - name: jobport-30236 @@ -49,7 +49,7 @@ spec: - name: PADDLE_INIT_USE_GPU value: "0" - name: LD_LIBRARY_PATH - value: "/usr/local/nvidia/lib64" + value: "/usr/local/lib:/usr/local/nvidia/lib64" - name: NAMESPACE valueFrom: fieldRef: diff --git a/benchmark/cluster/vgg16/v2/trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml similarity index 86% rename from benchmark/cluster/vgg16/v2/trainer.yaml rename to benchmark/cluster/vgg16/v2_trainer.yaml index 200b6dc304a5d73c80ca54f5d45e987d2e893d68..9d52e231f0e7e1804e515fb7f0de60e75635ae8b 100644 --- a/benchmark/cluster/vgg16/v2/trainer.yaml +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -15,12 +15,14 @@ spec: hostNetwork: true containers: - name: trainer - image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" imagePullPolicy: Always command: ["paddle_k8s", "start_trainer", "v2"] env: - name: PADDLE_JOB_NAME value: vgg16v2job + - name: BATCH_SIZE + value: "128" - name: TRAINERS value: "20" - name: PSERVERS @@ -28,7 +30,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 
python /workspace/vgg16.py" + value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT @@ -36,7 +38,7 @@ spec: - name: PADDLE_INIT_NICS value: "xgbe0" - name: PADDLE_INIT_TRAINER_COUNT - value: "1" + value: "2" - name: PADDLE_INIT_PORTS_NUM value: "1" - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE @@ -44,11 +46,11 @@ spec: - name: PADDLE_INIT_NUM_GRADIENT_SERVERS value: "20" - name: PADDLE_INIT_NUM_PASSES - value: "1" + value: "2" - name: PADDLE_INIT_USE_GPU value: "0" - name: LD_LIBRARY_PATH - value: "/usr/local/nvidia/lib64" + value: "/usr/local/lib:/usr/local/nvidia/lib64" - name: NAMESPACE valueFrom: fieldRef: diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/vgg16_fluid.py similarity index 100% rename from benchmark/cluster/vgg16/fluid/vgg16.py rename to benchmark/cluster/vgg16/vgg16_fluid.py diff --git a/benchmark/cluster/vgg16/v2/vgg16.py b/benchmark/cluster/vgg16/vgg16_v2.py similarity index 89% rename from benchmark/cluster/vgg16/v2/vgg16.py rename to benchmark/cluster/vgg16/vgg16_v2.py index 59e3997d7848cc8ed14a390f9c478feb4e837db8..284dbec48dcb794f947a4a9c4af7949697cac8e9 100644 --- a/benchmark/cluster/vgg16/v2/vgg16.py +++ b/benchmark/cluster/vgg16/vgg16_v2.py @@ -16,12 +16,17 @@ import gzip import paddle.v2.dataset.cifar as cifar import paddle.v2 as paddle -import reader import time +import os DATA_DIM = 3 * 32 * 32 CLASS_DIM = 10 -BATCH_SIZE = 128 +BATCH_SIZE = os.getenv("BATCH_SIZE") +if BATCH_SIZE: + BATCH_SIZE = int(BATCH_SIZE) +else: + BATCH_SIZE = 128 +NODE_COUNT = int(os.getenv("TRAINERS")) ts = 0 @@ -77,14 +82,15 @@ def vgg19(input, class_dim): def main(): global ts - paddle.init(use_gpu=False, trainer_count=1) + paddle.init(use_gpu=False) image = paddle.layer.data( name="image", type=paddle.data_type.dense_vector(DATA_DIM)) lbl = paddle.layer.data( name="label", type=paddle.data_type.integer_value(CLASS_DIM)) extra_layers = None - 
learning_rate = 0.01 + # NOTE: for v2 distributed training need averaging updates. + learning_rate = 1e-3 / NODE_COUNT out = vgg16(image, class_dim=CLASS_DIM) cost = paddle.layer.classification_cost(input=out, label=lbl) @@ -123,7 +129,9 @@ def main(): # End batch and end pass event handler def event_handler(event): - global ts + global ts, ts_pass + if isinstance(event, paddle.event.BeginPass): + ts_pass = time.time() if isinstance(event, paddle.event.BeginIteration): ts = time.time() if isinstance(event, paddle.event.EndIteration): @@ -132,9 +140,8 @@ def main(): event.pass_id, event.batch_id, event.cost, event.metrics, time.time() - ts) if isinstance(event, paddle.event.EndPass): - with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: - trainer.save_parameter_to_tar(f) - + print "Pass %d end, spent: %f" % (event.pass_id, + time.time() - ts_pass) result = trainer.test(reader=test_reader) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)