Update doc and merge run_multi.sh into run.sh for PaddlePaddle.

9d377f09 · dangqingqing · a8342d07 · 9d377f09 · 9d377f09 · 9d377f09
8 changed files
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -7,19 +7,19 @@ Machine:
 - cuDNN: v5.1
 - system: Docker 1.12.1, all platform are tested in docker environment.
-Platform: 
+Platforms: 
 - PaddlePaddle: 
 - Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
- Caffe: 
+- Caffe: kaixhin/cuda-caffe
-Several convolutional neural networks and recurrent neural network are used to test.
+Several convolutional neural networks and recurrent neural networks are used to test.
 ## Image
 ### Benchmark Model
-AlexNet, GooleNet and a small network which refer the config of cifar10 in Caffe are used.
+AlexNet, GoogleNet and a small network used in Caffe.
 - [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
@@ -38,9 +38,9 @@ AlexNet, GooleNet and a small network which refer the config of cifar10 in Caffe
 | TensorFlow   | 223 | 364  | 645   | 1235 |
 | Caffe        | 324 | 627  | 1232  | 2513 |
-##### Notation
+**Notation**
-All platforms use cuDnn-v5.1. You might see that caffe is slower, because the workspace limit size is 8 * 1024 * 1024 in Caffe's cuDnn-conv interface. This size is larger in PaddlePaddle and TensorFlow. Caffe will be faster if increasing the workspace limit size.
+All platforms use cuDNN-v5.1. We see that Caffe is slower in this experiment, because the workspace limit size of its cuDNN-conv interface is 8 * 1024 * 1024, which is smaller than that in PaddlePaddle and TensorFlow. Note that Caffe will be faster if the workspace limit size is increased.
 - GoogletNet:  input - 3 * 224 * 224, Time: ms/batch
@@ -59,9 +59,9 @@ All platforms use cuDnn-v5.1. You might see that caffe is slower, because the wo
 | TensorFlow   | 9     | 15       | 28      | 59       |
 | Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
-##### Notation
+**Notation**
-All the tests in caffe use `caffe time` to execute, which is not including the parameter updating process. But the time in PaddlePaddle and TensorFlow contains it.
+All the experiments in caffe use `caffe time` to execute, which does not include the time of parameter updating, whereas the time in PaddlePaddle and TensorFlow does include it. However, compared with the total time, the time of parameter updating is relatively small.
 In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
@@ -69,13 +69,13 @@ In Tensorflow, they implement algorithm searching method instead of using the al
 - AlexNet,  ms / batch
-| totoal-BatchSize | 128 * 4  | 256 * 4    |
+| total-BatchSize | 128 * 4  | 256 * 4    |
 |------------------|----------| -----------|
 | PaddlePaddle     | 347      | 622        |
 | TensorFlow       | 377      | 675        |
 | Caffe            | 1229     | 2435       |
-For example, if `totoal-BatchSize = 128 * 4`, the speed is calculated by 
+For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
 ```
  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
@@ -86,9 +86,9 @@ For example, if `totoal-BatchSize = 128 * 4`, the speed is calculated by
 <img src="figs/alexnet-4gpu.png" width="420">
- GooleNet, ms / batch
+- GoogleNet, ms / batch
-| totoal-BatchSize  | 128 * 4      |  256 * 4    |
+| total-BatchSize  | 128 * 4      |  256 * 4    |
 |-------------------|--------------| ----------- |
 | PaddlePaddle      | 1178         | 2367        |
 | TensorFlow        | 1210         | 2292        |
@@ -102,7 +102,7 @@ We use lstm network for text classfication to test benchmark.
 ### Dataset
 -  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence legth=100, in fact, PaddlePaddle support training with variable-length sequence. But TensorFlow need to pad, in order to compare, we also pad sequence length to 100 in PaddlePaddle.
+- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequences, but TensorFlow needs to pad them, so we also pad the sequence length to 100 in PaddlePaddle in order to compare.
 - Dictionary size=30000 
 - Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
@@ -110,7 +110,7 @@ We use lstm network for text classfication to test benchmark.
 #### LSTM in Text Classification
-Testing network for different hidden size, batch size with `2 lstm layer + fc` network.
+Testing `2 lstm layer + fc` network with different hidden size and batch size.
 - Batch size = 64, ms / batch
@@ -138,7 +138,7 @@ Testing network for different hidden size, batch size with `2 lstm layer + fc` n
 #### Seq2Seq
-The benchmark of sequence-to-sequence network will be add later.
+The benchmark of sequence-to-sequence network will be added later.
 ### Multi GPU: 4 GPUs
@@ -165,4 +165,4 @@ The benchmark of sequence-to-sequence network will be add later.
 #### Seq2Seq
-The benchmark of sequence-to-sequence network will be add later.
+The benchmark of sequence-to-sequence network will be added later.
--- a/benchmark/caffe/image/run_multi.sh
+++ b/benchmark/caffe/image/run_multi.sh
@@ -9,7 +9,7 @@ function test() {
  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
  sed -i "1c\net : \"${cfg}\"" solver.prototxt
-  caffe train --solver=solver.prototxt -gpu all > logs/${prefix}-4gpu-batch${batch}.log 2>&1
+  caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
 }
 if [ ! -d "logs" ]; then

--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
 set -e
+# When `paddle train` is used to run, a DataProvider must be used to
+# pass the data type to the PaddlePaddle system.
+# PaddlePaddle also requires a training set list (train.list).
 function gen_file() {
  if [ ! -d "train.txt" ]; then
    for ((i=1;i<=1024;i++))
@@ -26,7 +29,6 @@ function train() {
    --log_period=10 \
    --test_period=100 \
    --config_args=$args \
-    --cudnn_dir=/home/dangqingqing/tools/cudnn-5.1/lib64 \
    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
 }
@@ -52,3 +54,12 @@ train smallnet_mnist_cifar.py 1 64 smallnet
 train smallnet_mnist_cifar.py 1 128 smallnet
 train smallnet_mnist_cifar.py 1 256 smallnet
 train smallnet_mnist_cifar.py 1 512 smallnet
+############################
+#========multi-gpus=========#
+train alexnet.py 4 512 alexnet
+train alexnet.py 4 1024 alexnet
+train googlenet.py 4 512 googlenet 
+train googlenet.py 4 1024 googlenet
--- a/benchmark/paddle/image/run_multi.sh
+++ b/benchmark/paddle/image/run_multi.sh
-set -e
-function gen_file() {
-  if [ ! -d "train.txt" ]; then
-    for ((i=1;i<=1024;i++))
-    do
-      echo "train/n09246464/n09246464_38735.jpeg 972" >> train.txt
-    done
-  fi
-  if [ ! -d "train.list" ]; then
-    echo "train.txt" > train.list
-  fi
-}
-function train() {
-  cfg=$1
-  thread=$2
-  bz=$3
-  args="batch_size=$3"
-  prefix=$4
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=True \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
-}
-gen_file
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-#========multi-gpus=========#
-train alexnet.py 4 512 alexnet
-train alexnet.py 4 1024 alexnet
-train googlenet.py 4 512 googlenet 
-train googlenet.py 4 1024 googlenet
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
@@ -36,3 +36,15 @@ train rnn.py 1 2 1 1280 128
 train rnn.py 1 2 1 256 256 
 train rnn.py 1 2 1 512 256 
 train rnn.py 1 2 1 1280 256 
+#==================multi gpus=====================#
+# hidden_size=256, lstm_num=2, different batch size
+train rnn.py 4 2 1 256 128 
+train rnn.py 4 2 1 256 256 
+train rnn.py 4 2 1 256 512 
+# hidden_size=512, lstm_num=2, different batch size
+train rnn.py 4 2 1 512 128 
+train rnn.py 4 2 1 512 256 
+train rnn.py 4 2 1 512 512 
--- a/benchmark/paddle/rnn/run_multi.sh
+++ b/benchmark/paddle/rnn/run_multi.sh
-set -e
-function train() {
-  cfg=$1
-  thread=$2
-  args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=1 \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --num_passes=1 \
-    --feed_data=1 \
-    --config_args=$args \
-    >logs/rnn-pad${4}-${thread}gpu-lstm${3}-hid${5}-batch${6}.log 2>&1
-}
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-#-----config--gpu--lstm_num--padding--hidden_size--batch_size
-#==================multi gpus=====================#
-# hidden_size=256, lstm_num=2, different batch size
-train rnn.py 4 2 1 256 128 
-train rnn.py 4 2 1 256 256 
-train rnn.py 4 2 1 256 512 
-# hidden_size=512, lstm_num=4, different batch size
-train rnn.py 4 2 1 512 128 
-train rnn.py 4 2 1 512 256 
-train rnn.py 4 2 1 512 512 
--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
@@ -279,7 +279,6 @@ def run_benchmark():
                                    staircase=True)
    # Create an optimizer that performs gradient descent.
-    # opt = tf.train.GradientDescentOptimizer(lr)
    opt = tf.train.MomentumOptimizer(lr, 0.9)
    # Calculate the gradients for each model tower.

--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
@@ -222,7 +222,6 @@ def run_benchmark():
    objective = loss(last_layer, labels)
    # Compute gradients.
-    # opt = tf.train.GradientDescentOptimizer(0.001)
    opt = tf.train.MomentumOptimizer(0.001, 0.9)
    grads = opt.compute_gradients(objective) 
    global_step = tf.get_variable('global_step', [],