From 9d377f0996e671f649d9e1fdd40ce327712a5e46 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 23 Nov 2016 19:18:31 +0800 Subject: [PATCH] Update doc and merge run_multi.sh into run.sh for PaddlePaddle. --- benchmark/README.md | 32 +++++++------- benchmark/caffe/image/run_multi.sh | 2 +- benchmark/paddle/image/run.sh | 13 +++++- benchmark/paddle/image/run_multi.sh | 42 ------------------- benchmark/paddle/rnn/run.sh | 12 ++++++ benchmark/paddle/rnn/run_multi.sh | 34 --------------- .../tensorflow/image/alexnet_multi_gpu.py | 1 - .../tensorflow/image/smallnet_mnist_cifar.py | 1 - 8 files changed, 41 insertions(+), 96 deletions(-) delete mode 100755 benchmark/paddle/image/run_multi.sh delete mode 100755 benchmark/paddle/rnn/run_multi.sh diff --git a/benchmark/README.md b/benchmark/README.md index 8d2cf5737dd..8b453a7b59e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -7,19 +7,19 @@ Machine: - cuDNN: v5.1 - system: Docker 1.12.1, all platform are tested in docker environment. -Platform: +Platforms: - PaddlePaddle: - Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu -- Caffe: +- Caffe: kaixhin/cuda-caffe -Several convolutional neural networks and recurrent neural network are used to test. +Several convolutional neural networks and recurrent neural networks are used to test. ## Image ### Benchmark Model -AlexNet, GooleNet and a small network which refer the config of cifar10 in Caffe are used. +AlexNet, GoogleNet and a small network used in Caffe. - [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one. @@ -38,9 +38,9 @@ AlexNet, GooleNet and a small network which refer the config of cifar10 in Caffe | TensorFlow | 223 | 364 | 645 | 1235 | | Caffe | 324 | 627 | 1232 | 2513 | -##### Notation +**Notation** -All platforms use cuDnn-v5.1. You might see that caffe is slower, because the workspace limit size is 8 * 1024 * 1024 in Caffe's cuDnn-conv interface. 
This size is larger in PaddlePaddle and TensorFlow. Caffe will be faster if increasing the workspace limit size. +All platforms use cuDNN-v5.1. We see that Caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller than that in PaddlePaddle and TensorFlow. Note that Caffe will be faster if the workspace limit size is increased. - GoogletNet: input - 3 * 224 * 224, Time: ms/batch @@ -59,9 +59,9 @@ All platforms use cuDnn-v5.1. You might see that caffe is slower, because the wo | TensorFlow | 9 | 15 | 28 | 59 | | Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 | -##### Notation +**Notation** -All the tests in caffe use `caffe time` to execute, which is not including the parameter updating process. But the time in PaddlePaddle and TensorFlow contains it. +All the experiments in Caffe use `caffe time` to execute, which does not include the time of parameter updating. The time in PaddlePaddle and TensorFlow contains it. But, compared with the total time, the time of parameter updating is relatively small. In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
@@ -69,13 +69,13 @@ In Tensorflow, they implement algorithm searching method instead of using the al - AlexNet, ms / batch -| totoal-BatchSize | 128 * 4 | 256 * 4 | +| total-BatchSize | 128 * 4 | 256 * 4 | |------------------|----------| -----------| | PaddlePaddle | 347 | 622 | | TensorFlow | 377 | 675 | | Caffe | 1229 | 2435 | -For example, if `totoal-BatchSize = 128 * 4`, the speed is calculated by +For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by ``` time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 @@ -86,9 +86,9 @@ For example, if `totoal-BatchSize = 128 * 4`, the speed is calculated by -- GooleNet, ms / batch +- GoogleNet, ms / batch -| totoal-BatchSize | 128 * 4 | 256 * 4 | +| total-BatchSize | 128 * 4 | 256 * 4 | |-------------------|--------------| ----------- | | PaddlePaddle | 1178 | 2367 | | TensorFlow | 1210 | 2292 | @@ -102,7 +102,7 @@ We use lstm network for text classfication to test benchmark. ### Dataset - [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl) -- Sequence legth=100, in fact, PaddlePaddle support training with variable-length sequence. But TensorFlow need to pad, in order to compare, we also pad sequence length to 100 in PaddlePaddle. +- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequences, but TensorFlow needs to pad, so we also pad the sequence length to 100 in PaddlePaddle in order to compare. - Dictionary size=30000 - Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow. @@ -110,7 +110,7 @@ We use lstm network for text classfication to test benchmark. #### LSTM in Text Classification -Testing network for different hidden size, batch size with `2 lstm layer + fc` network. +Testing `2 lstm layer + fc` network with different hidden size and batch size.
- Batch size = 64, ms / batch @@ -138,7 +138,7 @@ Testing network for different hidden size, batch size with `2 lstm layer + fc` n #### Seq2Seq -The benchmark of sequence-to-sequence network will be add later. +The benchmark of sequence-to-sequence network will be added later. ### Multi GPU: 4 GPUs @@ -165,4 +165,4 @@ The benchmark of sequence-to-sequence network will be add later. #### Seq2Seq -The benchmark of sequence-to-sequence network will be add later. +The benchmark of sequence-to-sequence network will be added later. diff --git a/benchmark/caffe/image/run_multi.sh b/benchmark/caffe/image/run_multi.sh index f72b062c11c..9a0a71bc185 100755 --- a/benchmark/caffe/image/run_multi.sh +++ b/benchmark/caffe/image/run_multi.sh @@ -9,7 +9,7 @@ function test() { sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg sed -i "1c\net : \"${cfg}\"" solver.prototxt - caffe train --solver=solver.prototxt -gpu all > logs/${prefix}-4gpu-batch${batch}.log 2>&1 + caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1 } if [ ! -d "logs" ]; then diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh index 6fccf7854c6..a2169288351 100755 --- a/benchmark/paddle/image/run.sh +++ b/benchmark/paddle/image/run.sh @@ -1,5 +1,8 @@ set -e +# When using `paddle train` to run, a DataProvider must be used to +# pass the data type to the PaddlePaddle system. +# PaddlePaddle also requires a training set list (train.list). function gen_file() { if [ ! 
-d "train.txt" ]; then for ((i=1;i<=1024;i++)) @@ -26,7 +29,6 @@ function train() { --log_period=10 \ --test_period=100 \ --config_args=$args \ - --cudnn_dir=/home/dangqingqing/tools/cudnn-5.1/lib64 \ > logs/$prefix-${thread}gpu-$bz.log 2>&1 } @@ -52,3 +54,12 @@ train smallnet_mnist_cifar.py 1 64 smallnet train smallnet_mnist_cifar.py 1 128 smallnet train smallnet_mnist_cifar.py 1 256 smallnet train smallnet_mnist_cifar.py 1 512 smallnet + + +############################ +#========multi-gpus=========# +train alexnet.py 4 512 alexnet +train alexnet.py 4 1024 alexnet + +train googlenet.py 4 512 googlenet +train googlenet.py 4 1024 googlenet diff --git a/benchmark/paddle/image/run_multi.sh b/benchmark/paddle/image/run_multi.sh deleted file mode 100755 index c506668fe0b..00000000000 --- a/benchmark/paddle/image/run_multi.sh +++ /dev/null @@ -1,42 +0,0 @@ -set -e - -function gen_file() { - if [ ! -d "train.txt" ]; then - for ((i=1;i<=1024;i++)) - do - echo "train/n09246464/n09246464_38735.jpeg 972" >> train.txt - done - fi - - if [ ! -d "train.list" ]; then - echo "train.txt" > train.list - fi -} - -function train() { - cfg=$1 - thread=$2 - bz=$3 - args="batch_size=$3" - prefix=$4 - paddle train --job=time \ - --config=$cfg \ - --use_gpu=True \ - --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ - --config_args=$args \ - > logs/$prefix-${thread}gpu-$bz.log 2>&1 -} - -gen_file -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -#========multi-gpus=========# -train alexnet.py 4 512 alexnet -train alexnet.py 4 1024 alexnet - -train googlenet.py 4 512 googlenet -train googlenet.py 4 1024 googlenet diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh index 92c6e0b4b42..e9dfeb2e525 100755 --- a/benchmark/paddle/rnn/run.sh +++ b/benchmark/paddle/rnn/run.sh @@ -36,3 +36,15 @@ train rnn.py 1 2 1 1280 128 train rnn.py 1 2 1 256 256 train rnn.py 1 2 1 512 256 train rnn.py 1 2 1 1280 256 + + +#==================multi gpus=====================# +# hidden_size=256, lstm_num=2, different batch size +train rnn.py 4 2 1 256 128 +train rnn.py 4 2 1 256 256 +train rnn.py 4 2 1 256 512 + +# hidden_size=512, lstm_num=2, different batch size +train rnn.py 4 2 1 512 128 +train rnn.py 4 2 1 512 256 +train rnn.py 4 2 1 512 512 diff --git a/benchmark/paddle/rnn/run_multi.sh b/benchmark/paddle/rnn/run_multi.sh deleted file mode 100755 index 50ee469bcd9..00000000000 --- a/benchmark/paddle/rnn/run_multi.sh +++ /dev/null @@ -1,34 +0,0 @@ -set -e - -function train() { - cfg=$1 - thread=$2 - args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}" - paddle train --job=time \ - --config=$cfg \ - --use_gpu=1 \ - --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ - --num_passes=1 \ - --feed_data=1 \ - --config_args=$args \ - >logs/rnn-pad${4}-${thread}gpu-lstm${3}-hid${5}-batch${6}.log 2>&1 -} - - -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -#-----config--gpu--lstm_num--padding--hidden_size--batch_size -#==================multi gpus=====================# -# hidden_size=256, lstm_num=2, different batch size -train rnn.py 4 2 1 256 128 -train rnn.py 4 2 1 256 256 -train rnn.py 4 2 1 256 512 - -# hidden_size=512, lstm_num=4, different batch size -train rnn.py 4 2 1 512 128 -train rnn.py 4 2 1 512 256 -train rnn.py 4 2 1 512 512 diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py index 949ad77f3b8..f006fb56af7 100644 --- a/benchmark/tensorflow/image/alexnet_multi_gpu.py +++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py @@ -279,7 +279,6 @@ def run_benchmark(): staircase=True) # Create an optimizer that performs gradient descent. - # opt = tf.train.GradientDescentOptimizer(lr) opt = tf.train.MomentumOptimizer(lr, 0.9) # Calculate the gradients for each model tower. diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py index b539d1bed06..679dd1ab322 100644 --- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py +++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py @@ -222,7 +222,6 @@ def run_benchmark(): objective = loss(last_layer, labels) # Compute gradients. - # opt = tf.train.GradientDescentOptimizer(0.001) opt = tf.train.MomentumOptimizer(0.001, 0.9) grads = opt.compute_gradients(objective) global_step = tf.get_variable('global_step', [], -- GitLab