diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8d2cf5737dd1ca00067a13b4d57286d8a7295cea --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,168 @@ +# Benchmark + +Machine: + +- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz +- GPU: Tesla K40m +- cuDNN: v5.1 +- system: Docker 1.12.1; all platforms are tested in a Docker environment. + +Platform: + +- PaddlePaddle: +- TensorFlow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu +- Caffe: + +Several convolutional neural networks and recurrent neural networks are used for testing. + +## Image + +### Benchmark Model + +AlexNet, GoogleNet and a small network based on the cifar10 config in Caffe are used. + +- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but with a group size of one. + +- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but with loss1 and loss2 removed for benchmarking. + +- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt) + + +### Single-GPU + +- AlexNet: input - 3 * 227 * 227, Time: ms/batch + +| BatchSize | 64 | 128 | 256 | 512 | +|--------------|-----| -----| ------| -----| +| PaddlePaddle | 195 | 334 | 602 | 1629 | +| TensorFlow | 223 | 364 | 645 | 1235 | +| Caffe | 324 | 627 | 1232 | 2513 | + +##### Notation + +All platforms use cuDNN v5.1. Caffe appears slower because the workspace limit size is 8 * 1024 * 1024 in Caffe's cuDNN convolution interface. This size is larger in PaddlePaddle and TensorFlow. Caffe will be faster if the workspace limit size is increased. 
+ +- GoogleNet: input - 3 * 224 * 224, Time: ms/batch + + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| -------| --------| +| PaddlePaddle | 613 | 1149 | 2348 | +| TensorFlow | 644 | 1176 | 2219 | +| Caffe | 694 | 1364 | out of memory | + +- SmallNet: input - 3 * 32 * 32, Time: ms/batch + +| BatchSize | 64 | 128 | 256 | 512 | +|--------------|--------| -------- | --------|---------| +| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 | +| TensorFlow | 9 | 15 | 28 | 59 | +| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 | + +##### Notation + +All the Caffe tests are run with `caffe time`, which does not include the parameter updating process, whereas the times for PaddlePaddle and TensorFlow do include it. + +TensorFlow implements its own algorithm searching method instead of using the algorithm searching interface in cuDNN. + +### Multi-GPU: 4 GPUs + +- AlexNet, ms / batch + +| total-BatchSize | 128 * 4 | 256 * 4 | +|------------------|----------| -----------| +| PaddlePaddle | 347 | 622 | +| TensorFlow | 377 | 675 | +| Caffe | 1229 | 2435 | + +For example, if `total-BatchSize = 128 * 4`, the speedup is calculated by + +``` + time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 += (334 * 4)/347 += 3.85 +``` + + + + +- GoogleNet, ms / batch + +| total-BatchSize | 128 * 4 | 256 * 4 | +|-------------------|--------------| ----------- | +| PaddlePaddle | 1178 | 2367 | +| TensorFlow | 1210 | 2292 | +| Caffe | 2007 | out of memory | + + + + +## RNN +We use an LSTM network for text classification as the benchmark. + +### Dataset +- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl) +- Sequence length=100. In fact, PaddlePaddle supports training with variable-length sequences, but TensorFlow needs padding; for a fair comparison, we also pad sequences to length 100 in PaddlePaddle. +- Dictionary size=30000 +- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow. 
+ +### Single GPU + +#### LSTM in Text Classification + +We test different hidden sizes and batch sizes with a `2 lstm layer + fc` network. + +- Batch size = 64, ms / batch + +| hidden_size | 256 | 512 | 1280 | +|--------------|-------| -------| --------| +| PaddlePaddle | 83 | 184 | 641 | +| TensorFlow | 175 | 280 | 818 | + +- Batch size = 128, ms / batch + +| hidden_size | 256 | 512 | 1280 | +|--------------|------- | -------| --------| +| PaddlePaddle | 110 | 261 | 1007 | +| TensorFlow | 181 | 361 | 1237 | + + +- Batch size = 256, ms / batch + +| hidden_size | 256 | 512 | 1280 | +|--------------|-------| -------| --------| +| PaddlePaddle | 170 | 414 | 1655 | +| TensorFlow | 238 | 536 | 1905 | + + + +#### Seq2Seq + +The benchmark of the sequence-to-sequence network will be added later. + + +### Multi GPU: 4 GPUs + +#### LSTM in Text Classification + +- hidden_size = 256, ms / batch + +| batch_size | 256 | 512 | +|--------------| -------| --------| +| PaddlePaddle | 90 | 118 | +| TensorFlow | 226 | 118 | + + +- hidden_size = 512, ms / batch + +| batch_size | 256 | 512 | +|--------------| -------| --------| +| PaddlePaddle | 189 | 268 | +| TensorFlow | 297 | 383 | + + + + +#### Seq2Seq + +The benchmark of the sequence-to-sequence network will be added later. 
diff --git a/benchmark/caffe/image/alexnet.prototxt b/benchmark/caffe/image/alexnet.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..aca184ddaf2ca2b5e2bea17d131055e0621b8271 --- /dev/null +++ b/benchmark/caffe/image/alexnet.prototxt @@ -0,0 +1,347 @@ +name: "alexnet" +input: "data" +input_dim: 64 +input_dim: 3 +input_dim: 227 +input_dim: 227 +input: "label" +input_dim: 64 +input_dim: 1 +input_dim: 1 +input_dim: 1 +force_backward: true +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + 
} + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + 
num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} diff --git a/benchmark/caffe/image/googlenet.prototxt b/benchmark/caffe/image/googlenet.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..c5f3b4fe3efcb6f7397031c086997fa914c67b7f --- /dev/null +++ b/benchmark/caffe/image/googlenet.prototxt @@ -0,0 +1,2334 @@ +name: "googlenet" +input: "data" +input_dim: 128 +input_dim: 3 +input_dim: 224 +input_dim: 224 +input: "label" +input_dim: 128 +input_dim: 1 +input_dim: 1 +input_dim: 1 +layer { + name: "conv1/7x7_s2" + type: "Convolution" + bottom: "data" + top: "conv1/7x7_s2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv1/relu_7x7" + type: "ReLU" + bottom: "conv1/7x7_s2" + top: "conv1/7x7_s2" +} +layer { + name: "pool1/3x3_s2" + type: "Pooling" + bottom: "conv1/7x7_s2" + top: "pool1/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +#layer { +# name: "pool1/norm1" +# type: "LRN" +# bottom: "pool1/3x3_s2" +# top: "pool1/norm1" +# lrn_param { +# local_size: 5 +# alpha: 0.0001 +# beta: 0.75 +# } +#} +layer { + name: 
"conv2/3x3_reduce" + type: "Convolution" +# bottom: "pool1/norm1" + bottom: "pool1/3x3_s2" + top: "conv2/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv2/relu_3x3_reduce" + type: "ReLU" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3_reduce" +} +layer { + name: "conv2/3x3" + type: "Convolution" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv2/relu_3x3" + type: "ReLU" + bottom: "conv2/3x3" + top: "conv2/3x3" +} +#layer { +# name: "conv2/norm2" +# type: "LRN" +# bottom: "conv2/3x3" +# top: "conv2/norm2" +# lrn_param { +# local_size: 5 +# alpha: 0.0001 +# beta: 0.75 +# } +#} +layer { + name: "pool2/3x3_s2" + type: "Pooling" +# bottom: "conv2/norm2" + bottom: "conv2/3x3" + top: "pool2/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_3a/1x1" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_1x1" + type: "ReLU" + bottom: "inception_3a/1x1" + top: "inception_3a/1x1" +} +layer { + name: "inception_3a/3x3_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + 
weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3_reduce" +} +layer { + name: "inception_3a/3x3" + type: "Convolution" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_3x3" + type: "ReLU" + bottom: "inception_3a/3x3" + top: "inception_3a/3x3" +} +layer { + name: "inception_3a/5x5_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5_reduce" +} +layer { + name: "inception_3a/5x5" + type: "Convolution" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5" + type: "ReLU" + bottom: "inception_3a/5x5" + top: "inception_3a/5x5" +} +layer { + name: "inception_3a/pool" + type: "Pooling" + bottom: "pool2/3x3_s2" + top: "inception_3a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_3a/pool_proj" + type: "Convolution" + bottom: "inception_3a/pool" + top: 
"inception_3a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_pool_proj" + type: "ReLU" + bottom: "inception_3a/pool_proj" + top: "inception_3a/pool_proj" +} +layer { + name: "inception_3a/output" + type: "Concat" + bottom: "inception_3a/1x1" + bottom: "inception_3a/3x3" + bottom: "inception_3a/5x5" + bottom: "inception_3a/pool_proj" + top: "inception_3a/output" +} +layer { + name: "inception_3b/1x1" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_1x1" + type: "ReLU" + bottom: "inception_3b/1x1" + top: "inception_3b/1x1" +} +layer { + name: "inception_3b/3x3_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3_reduce" +} +layer { + name: "inception_3b/3x3" + type: "Convolution" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3" + 
type: "ReLU" + bottom: "inception_3b/3x3" + top: "inception_3b/3x3" +} +layer { + name: "inception_3b/5x5_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5_reduce" +} +layer { + name: "inception_3b/5x5" + type: "Convolution" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5" + type: "ReLU" + bottom: "inception_3b/5x5" + top: "inception_3b/5x5" +} +layer { + name: "inception_3b/pool" + type: "Pooling" + bottom: "inception_3a/output" + top: "inception_3b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_3b/pool_proj" + type: "Convolution" + bottom: "inception_3b/pool" + top: "inception_3b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_pool_proj" + type: "ReLU" + bottom: "inception_3b/pool_proj" + top: "inception_3b/pool_proj" +} +layer { + name: "inception_3b/output" + type: "Concat" + bottom: "inception_3b/1x1" + bottom: "inception_3b/3x3" + bottom: "inception_3b/5x5" + bottom: "inception_3b/pool_proj" + top: "inception_3b/output" +} +layer { + name: "pool3/3x3_s2" + type: 
"Pooling" + bottom: "inception_3b/output" + top: "pool3/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_4a/1x1" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_1x1" + type: "ReLU" + bottom: "inception_4a/1x1" + top: "inception_4a/1x1" +} +layer { + name: "inception_4a/3x3_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3_reduce" +} +layer { + name: "inception_4a/3x3" + type: "Convolution" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 208 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_3x3" + type: "ReLU" + bottom: "inception_4a/3x3" + top: "inception_4a/3x3" +} +layer { + name: "inception_4a/5x5_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5_reduce" + type: 
"ReLU" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5_reduce" +} +layer { + name: "inception_4a/5x5" + type: "Convolution" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5" + type: "ReLU" + bottom: "inception_4a/5x5" + top: "inception_4a/5x5" +} +layer { + name: "inception_4a/pool" + type: "Pooling" + bottom: "pool3/3x3_s2" + top: "inception_4a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4a/pool_proj" + type: "Convolution" + bottom: "inception_4a/pool" + top: "inception_4a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_pool_proj" + type: "ReLU" + bottom: "inception_4a/pool_proj" + top: "inception_4a/pool_proj" +} +layer { + name: "inception_4a/output" + type: "Concat" + bottom: "inception_4a/1x1" + bottom: "inception_4a/3x3" + bottom: "inception_4a/5x5" + bottom: "inception_4a/pool_proj" + top: "inception_4a/output" +} +#layer { +# name: "loss1/ave_pool" +# type: "Pooling" +# bottom: "inception_4a/output" +# top: "loss1/ave_pool" +# pooling_param { +# pool: AVE +# kernel_size: 5 +# stride: 3 +# } +#} +#layer { +# name: "loss1/conv" +# type: "Convolution" +# bottom: "loss1/ave_pool" +# top: "loss1/conv" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# convolution_param { +# num_output: 128 +# kernel_size: 1 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0.2 +# } +# } 
+#} +#layer { +# name: "loss1/relu_conv" +# type: "ReLU" +# bottom: "loss1/conv" +# top: "loss1/conv" +#} +#layer { +# name: "loss1/fc" +# type: "InnerProduct" +# bottom: "loss1/conv" +# top: "loss1/fc" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# inner_product_param { +# num_output: 1024 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0.2 +# } +# } +#} +#layer { +# name: "loss1/relu_fc" +# type: "ReLU" +# bottom: "loss1/fc" +# top: "loss1/fc" +#} +#layer { +# name: "loss1/drop_fc" +# type: "Dropout" +# bottom: "loss1/fc" +# top: "loss1/fc" +# dropout_param { +# dropout_ratio: 0.7 +# } +#} +#layer { +# name: "loss1/classifier" +# type: "InnerProduct" +# bottom: "loss1/fc" +# top: "loss1/classifier" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# inner_product_param { +# num_output: 1000 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0 +# } +# } +#} +#layer { +# name: "loss1/loss" +# type: "SoftmaxWithLoss" +# bottom: "loss1/classifier" +# bottom: "label" +# top: "loss1/loss1" +# loss_weight: 0.3 +#} +layer { + name: "inception_4b/1x1" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_1x1" + type: "ReLU" + bottom: "inception_4b/1x1" + top: "inception_4b/1x1" +} +layer { + name: "inception_4b/3x3_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + 
type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3_reduce" +} +layer { + name: "inception_4b/3x3" + type: "Convolution" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 224 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3" + type: "ReLU" + bottom: "inception_4b/3x3" + top: "inception_4b/3x3" +} +layer { + name: "inception_4b/5x5_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5_reduce" +} +layer { + name: "inception_4b/5x5" + type: "Convolution" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5" + type: "ReLU" + bottom: "inception_4b/5x5" + top: "inception_4b/5x5" +} +layer { + name: "inception_4b/pool" + type: "Pooling" + bottom: "inception_4a/output" + top: "inception_4b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4b/pool_proj" + type: "Convolution" + bottom: "inception_4b/pool" + top: "inception_4b/pool_proj" + param { + lr_mult: 1 + decay_mult: 
1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_pool_proj" + type: "ReLU" + bottom: "inception_4b/pool_proj" + top: "inception_4b/pool_proj" +} +layer { + name: "inception_4b/output" + type: "Concat" + bottom: "inception_4b/1x1" + bottom: "inception_4b/3x3" + bottom: "inception_4b/5x5" + bottom: "inception_4b/pool_proj" + top: "inception_4b/output" +} +layer { + name: "inception_4c/1x1" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_1x1" + type: "ReLU" + bottom: "inception_4c/1x1" + top: "inception_4c/1x1" +} +layer { + name: "inception_4c/3x3_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3_reduce" +} +layer { + name: "inception_4c/3x3" + type: "Convolution" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_3x3" + type: "ReLU" + bottom: "inception_4c/3x3" + top: 
"inception_4c/3x3" +} +layer { + name: "inception_4c/5x5_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5_reduce" +} +layer { + name: "inception_4c/5x5" + type: "Convolution" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5" + type: "ReLU" + bottom: "inception_4c/5x5" + top: "inception_4c/5x5" +} +layer { + name: "inception_4c/pool" + type: "Pooling" + bottom: "inception_4b/output" + top: "inception_4c/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4c/pool_proj" + type: "Convolution" + bottom: "inception_4c/pool" + top: "inception_4c/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_pool_proj" + type: "ReLU" + bottom: "inception_4c/pool_proj" + top: "inception_4c/pool_proj" +} +layer { + name: "inception_4c/output" + type: "Concat" + bottom: "inception_4c/1x1" + bottom: "inception_4c/3x3" + bottom: "inception_4c/5x5" + bottom: "inception_4c/pool_proj" + top: "inception_4c/output" +} +layer { + name: "inception_4d/1x1" + type: "Convolution" + bottom: "inception_4c/output" + 
top: "inception_4d/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_1x1" + type: "ReLU" + bottom: "inception_4d/1x1" + top: "inception_4d/1x1" +} +layer { + name: "inception_4d/3x3_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 144 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3_reduce" +} +layer { + name: "inception_4d/3x3" + type: "Convolution" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 288 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3" + type: "ReLU" + bottom: "inception_4d/3x3" + top: "inception_4d/3x3" +} +layer { + name: "inception_4d/5x5_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5_reduce" +} +layer { + name: "inception_4d/5x5" + type: "Convolution" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5" + 
param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5" + type: "ReLU" + bottom: "inception_4d/5x5" + top: "inception_4d/5x5" +} +layer { + name: "inception_4d/pool" + type: "Pooling" + bottom: "inception_4c/output" + top: "inception_4d/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4d/pool_proj" + type: "Convolution" + bottom: "inception_4d/pool" + top: "inception_4d/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_pool_proj" + type: "ReLU" + bottom: "inception_4d/pool_proj" + top: "inception_4d/pool_proj" +} +layer { + name: "inception_4d/output" + type: "Concat" + bottom: "inception_4d/1x1" + bottom: "inception_4d/3x3" + bottom: "inception_4d/5x5" + bottom: "inception_4d/pool_proj" + top: "inception_4d/output" +} +#layer { +# name: "loss2/ave_pool" +# type: "Pooling" +# bottom: "inception_4d/output" +# top: "loss2/ave_pool" +# pooling_param { +# pool: AVE +# kernel_size: 5 +# stride: 3 +# } +#} +#layer { +# name: "loss2/conv" +# type: "Convolution" +# bottom: "loss2/ave_pool" +# top: "loss2/conv" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# convolution_param { +# num_output: 128 +# kernel_size: 1 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0.2 +# } +# } +#} +#layer { +# name: "loss2/relu_conv" +# type: "ReLU" +# bottom: "loss2/conv" +# top: "loss2/conv" +#} +#layer { +# name: "loss2/fc" +# type: "InnerProduct" +# bottom: "loss2/conv" +# top: 
"loss2/fc" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# inner_product_param { +# num_output: 1024 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0.2 +# } +# } +#} +#layer { +# name: "loss2/relu_fc" +# type: "ReLU" +# bottom: "loss2/fc" +# top: "loss2/fc" +#} +#layer { +# name: "loss2/drop_fc" +# type: "Dropout" +# bottom: "loss2/fc" +# top: "loss2/fc" +# dropout_param { +# dropout_ratio: 0.7 +# } +#} +#layer { +# name: "loss2/classifier" +# type: "InnerProduct" +# bottom: "loss2/fc" +# top: "loss2/classifier" +# param { +# lr_mult: 1 +# decay_mult: 1 +# } +# param { +# lr_mult: 2 +# decay_mult: 0 +# } +# inner_product_param { +# num_output: 1000 +# weight_filler { +# type: "xavier" +# } +# bias_filler { +# type: "constant" +# value: 0 +# } +# } +#} +#layer { +# name: "loss2/loss" +# type: "SoftmaxWithLoss" +# bottom: "loss2/classifier" +# bottom: "label" +# top: "loss2/loss1" +# loss_weight: 0.3 +#} +layer { + name: "inception_4e/1x1" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_1x1" + type: "ReLU" + bottom: "inception_4e/1x1" + top: "inception_4e/1x1" +} +layer { + name: "inception_4e/3x3_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3_reduce" +} +layer { + 
name: "inception_4e/3x3" + type: "Convolution" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3" + type: "ReLU" + bottom: "inception_4e/3x3" + top: "inception_4e/3x3" +} +layer { + name: "inception_4e/5x5_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5_reduce" +} +layer { + name: "inception_4e/5x5" + type: "Convolution" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5" + type: "ReLU" + bottom: "inception_4e/5x5" + top: "inception_4e/5x5" +} +layer { + name: "inception_4e/pool" + type: "Pooling" + bottom: "inception_4d/output" + top: "inception_4e/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4e/pool_proj" + type: "Convolution" + bottom: "inception_4e/pool" + top: "inception_4e/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + 
value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_pool_proj" + type: "ReLU" + bottom: "inception_4e/pool_proj" + top: "inception_4e/pool_proj" +} +layer { + name: "inception_4e/output" + type: "Concat" + bottom: "inception_4e/1x1" + bottom: "inception_4e/3x3" + bottom: "inception_4e/5x5" + bottom: "inception_4e/pool_proj" + top: "inception_4e/output" +} +layer { + name: "pool4/3x3_s2" + type: "Pooling" + bottom: "inception_4e/output" + top: "pool4/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_5a/1x1" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_1x1" + type: "ReLU" + bottom: "inception_5a/1x1" + top: "inception_5a/1x1" +} +layer { + name: "inception_5a/3x3_reduce" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3_reduce" +} +layer { + name: "inception_5a/3x3" + type: "Convolution" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3" + type: "ReLU" + bottom: "inception_5a/3x3" + top: "inception_5a/3x3" +} +layer { + name: 
"inception_5a/5x5_reduce" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5_reduce" +} +layer { + name: "inception_5a/5x5" + type: "Convolution" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5" + type: "ReLU" + bottom: "inception_5a/5x5" + top: "inception_5a/5x5" +} +layer { + name: "inception_5a/pool" + type: "Pooling" + bottom: "pool4/3x3_s2" + top: "inception_5a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5a/pool_proj" + type: "Convolution" + bottom: "inception_5a/pool" + top: "inception_5a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_pool_proj" + type: "ReLU" + bottom: "inception_5a/pool_proj" + top: "inception_5a/pool_proj" +} +layer { + name: "inception_5a/output" + type: "Concat" + bottom: "inception_5a/1x1" + bottom: "inception_5a/3x3" + bottom: "inception_5a/5x5" + bottom: "inception_5a/pool_proj" + top: "inception_5a/output" +} +layer { + name: "inception_5b/1x1" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/1x1" + param { + lr_mult: 1 + 
decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_1x1" + type: "ReLU" + bottom: "inception_5b/1x1" + top: "inception_5b/1x1" +} +layer { + name: "inception_5b/3x3_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3_reduce" +} +layer { + name: "inception_5b/3x3" + type: "Convolution" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3" + type: "ReLU" + bottom: "inception_5b/3x3" + top: "inception_5b/3x3" +} +layer { + name: "inception_5b/5x5_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5_reduce" +} +layer { + name: "inception_5b/5x5" + type: "Convolution" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + 
lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5" + type: "ReLU" + bottom: "inception_5b/5x5" + top: "inception_5b/5x5" +} +layer { + name: "inception_5b/pool" + type: "Pooling" + bottom: "inception_5a/output" + top: "inception_5b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5b/pool_proj" + type: "Convolution" + bottom: "inception_5b/pool" + top: "inception_5b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_pool_proj" + type: "ReLU" + bottom: "inception_5b/pool_proj" + top: "inception_5b/pool_proj" +} +layer { + name: "inception_5b/output" + type: "Concat" + bottom: "inception_5b/1x1" + bottom: "inception_5b/3x3" + bottom: "inception_5b/5x5" + bottom: "inception_5b/pool_proj" + top: "inception_5b/output" +} +layer { + name: "pool5/7x7_s1" + type: "Pooling" + bottom: "inception_5b/output" + top: "pool5/7x7_s1" + pooling_param { + pool: AVE + kernel_size: 7 + stride: 1 + } +} +layer { + name: "pool5/drop_7x7_s1" + type: "Dropout" + bottom: "pool5/7x7_s1" + top: "pool5/7x7_s1" + dropout_param { + dropout_ratio: 0.4 + } +} +layer { + name: "loss3/classifier" + type: "InnerProduct" + bottom: "pool5/7x7_s1" + top: "loss3/classifier" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss3/loss3" + type: "SoftmaxWithLoss" + bottom: "loss3/classifier" + bottom: "label" + top: "loss3/loss3" + loss_weight: 
#!/bin/bash
# Single-GPU Caffe benchmark driver.
#
# Each run uses `caffe time`, which (per the benchmark README) does not
# include the parameter-update step. Output of every run goes to
# logs/<prefix>-1gpu-batch<batch>.log.
set -e

# Rewrite the batch dimension of the "data" and "label" inputs in the given
# prototxt (the file is edited in place), then time 50 iterations on GPU 0.
#   $1  model prototxt
#   $2  batch size
#   $3  log file prefix
# NOTE: the name shadows the shell's `test` builtin; `[ ... ]` is unaffected.
function test() {
  local cfg=$1
  local batch=$2
  local prefix=$3
  # The line following `input: "data"` / `input: "label"` is the batch dim.
  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" "$cfg"
  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" "$cfg"
  caffe time --model="$cfg" --iterations=50 --gpu 0 \
    > "logs/${prefix}-1gpu-batch${batch}.log" 2>&1
}

mkdir -p logs

# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet

# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet

# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
-d "logs" ]; then + mkdir logs +fi + +# alexnet +test alexnet.prototxt 512 alexnet +test alexnet.prototxt 1024 alexnet + +# googlnet +test googlenet.prototxt 512 googlenet diff --git a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..3cb0e32bbfb9f785ece6d428356987e5503dd25d --- /dev/null +++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt @@ -0,0 +1,198 @@ +name: "mnist/cifar" +input: "data" +input_dim: 128 +input_dim: 3 +input_dim: 32 +input_dim: 32 +input: "label" +input_dim: 128 +input_dim: 1 +input_dim: 1 +input_dim: 1 +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 32 + pad: 2 + kernel_size: 5 + stride: 1 + weight_filler { + type: "gaussian" + std: 0.0001 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "pool1" + top: "pool1" +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 32 + pad: 2 + kernel_size: 5 + stride: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: AVE + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + stride: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + 
type: "constant" + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3" + top: "pool3" + pooling_param { + pool: AVE + kernel_size: 3 + stride: 2 + } +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool3" + top: "ip1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 64 + weight_filler { + type: "gaussian" + std: 0.1 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "ip2" + type: "InnerProduct" + bottom: "ip1" + top: "ip2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 10 + weight_filler { + type: "gaussian" + std: 0.1 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "ip2" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "ip2" + bottom: "label" + top: "loss" +} diff --git a/benchmark/caffe/image/solver.prototxt b/benchmark/caffe/image/solver.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..61c10284e6027b4cc0b3d4c8fcf949e0a5a22a85 --- /dev/null +++ b/benchmark/caffe/image/solver.prototxt @@ -0,0 +1,10 @@ +net: "alexnet.prototxt" +base_lr: 0.01 +lr_policy: "fixed" +display: 20 +max_iter: 200 +momentum: 0.9 +weight_decay: 0.0005 +snapshot: 10000 +snapshot_prefix: "models/caffe_alexnet_train" +solver_mode: GPU diff --git a/benchmark/figs/alexnet-4gpu.png b/benchmark/figs/alexnet-4gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..864c11313d7eacc90910a902c6a3a39c2bef77cc Binary files /dev/null and b/benchmark/figs/alexnet-4gpu.png differ diff --git a/benchmark/figs/googlenet-4gpu.png b/benchmark/figs/googlenet-4gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..098ed35bf7763d34993357501f16b6d859c8733f Binary files /dev/null and 
#!/usr/bin/env python
# PaddlePaddle trainer config for the AlexNet benchmark.
# Input is 3 x 227 x 227; every grouped convolution uses groups=1 (the
# benchmark README states the group size is one for all platforms).

from paddle.trainer_config_helpers import *

height=227
width=227
num_class = 1000
# Batch size is supplied by the run scripts via --config_args=batch_size=N;
# 128 is the fallback when the argument is absent.
batch_size = get_config_arg('batch_size', int, 128)

# Forwarded as keyword arguments to the provider's init hook.
args={'height':height, 'width':width, 'color':True, 'num_class':num_class}
# Training data comes from `process` in provider.py; no test source (None).
define_py_data_sources2("train.list",
                        None,
                        module="provider",
                        obj="process",
                        args=args)


# NOTE(review): the learning rate is divided by, and the L2 coefficient
# multiplied by, batch_size - presumably to compensate for how Paddle
# normalizes gradients over a batch; confirm against the optimizer docs.
settings(
    batch_size = batch_size,
    learning_rate = 0.01 / batch_size,
    learning_method = MomentumOptimizer(0.9),
    regularization = L2Regularization(0.0005 * batch_size)
)


# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(input=net, filter_size=11, num_channels=3,
                     num_filters=96, stride=4, padding=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)

# conv2
net = img_conv_layer(input=net, filter_size=5, num_filters=256,
                     stride=1, padding=2, groups=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)

# conv3
net = img_conv_layer(input=net, filter_size=3, num_filters=384,
                     stride=1, padding=1)
# conv4
net = img_conv_layer(input=net, filter_size=3, num_filters=384,
                     stride=1, padding=1, groups=1)

# conv5
net = img_conv_layer(input=net, filter_size=3, num_filters=256,
                     stride=1, padding=1, groups=1)
net = img_pool_layer(input=net, pool_size=3, stride=2)

# Two 4096-wide fully connected layers with 50% dropout, then the
# 1000-way softmax classifier.
net = fc_layer(input=net, size=4096, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=4096, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())

# The cross-entropy loss against the label input is the only network output.
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
def inception(name, input, channels,
              filter1,
              filter3R, filter3,
              filter5R, filter5,
              proj):
    """Build one inception module and return its concatenated output layer.

    Four parallel branches read from ``input`` (which has ``channels``
    channels): a 1x1 projection, a 1x1 reduce followed by a 3x3 projection,
    a 1x1 reduce followed by a 5x5 projection, and a 3x3 max-pool followed
    by a 1x1 projection. The branches are joined by a concat layer named
    ``name`` that adds a bias and applies ReLU.

    The ``filter*`` / ``proj`` arguments are the output filter counts of the
    corresponding convolutions.
    """
    # Branch 1: plain 1x1 convolution.
    branch_1x1 = conv_projection(
        input=input, filter_size=1, num_channels=channels,
        num_filters=filter1, stride=1, padding=0)

    # Branch 2: 1x1 reduction, then 3x3 convolution.
    reduce_3x3 = img_conv_layer(
        name=name + '_3r', input=input, filter_size=1,
        num_channels=channels, num_filters=filter3R,
        stride=1, padding=0)
    branch_3x3 = conv_projection(
        input=reduce_3x3, filter_size=3, num_filters=filter3,
        stride=1, padding=1)

    # Branch 3: 1x1 reduction, then 5x5 convolution.
    reduce_5x5 = img_conv_layer(
        name=name + '_5r', input=input, filter_size=1,
        num_channels=channels, num_filters=filter5R,
        stride=1, padding=0)
    branch_5x5 = conv_projection(
        input=reduce_5x5, filter_size=5, num_filters=filter5,
        stride=1, padding=2)

    # Branch 4: 3x3 max-pool (stride 1), then 1x1 projection.
    pooled = img_pool_layer(
        name=name + '_max', input=input, pool_size=3,
        num_channels=channels, stride=1, padding=1)
    branch_pool = conv_projection(
        input=pooled, filter_size=1, num_filters=proj,
        stride=1, padding=0)

    branches = [branch_1x1, branch_3x3, branch_5x5, branch_pool]
    return concat_layer(name=name, input=branches,
                        bias_attr=True, act=ReluActivation())
+# stage 2 +conv2_1 = img_conv_layer(name="conv2_1", input=pool1, filter_size=1, + num_filters=64, stride=1, padding=0) +conv2_2 = img_conv_layer(name="conv2_2", input=conv2_1, filter_size=3, + num_filters=192, stride=1, padding=1) +pool2 = img_pool_layer(name="pool2", input=conv2_2, pool_size=3, + num_channels=192, stride=2) + +# stage 3 +ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32) +ince3b = inception("ince3b", ince3a, 256, 128, 128,192, 32, 96, 64) +pool3 = img_pool_layer(name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2) + +# stage 4 +ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64) +ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64) +ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64) +ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64) +ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128) +pool4 = img_pool_layer(name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2) + +# stage 5 +ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128) +ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128) +pool5 = img_pool_layer(name="pool5", input=ince5b, num_channels=1024, pool_size=7, stride=7, pool_type=AvgPooling()) + +# We remove loss1 and loss2 for all system when testing benchmark +# output 1 +# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling()) +# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0) +# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation()) +# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation()) +# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3) + +# output 2 +#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, 
@provider(init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
    """Yield one synthetic (image, label) sample per line of ``file_list``.

    The contents of each line are ignored; the file only determines how many
    samples are produced. Each image is ``settings.data_size`` uniform random
    floats and each label is a random class index in
    ``[0, settings.num_class - 1]``.
    """
    with open(file_list, 'r') as fdata:
        for _ in fdata:
            img = np.random.rand(settings.data_size)
            # random.randint is inclusive on BOTH ends, so the upper bound
            # must be num_class - 1 to stay inside the valid label range.
            lab = random.randint(0, settings.num_class - 1)
            yield img.tolist(), lab
#!/bin/bash
# Single-GPU PaddlePaddle image benchmark driver.
# Output of every run goes to logs/<prefix>-<threads>gpu-<batch>.log.
set -e

# Location of the cuDNN libraries handed to `paddle train`. Override by
# exporting CUDNN_DIR; the default keeps the path the script always used.
CUDNN_DIR=${CUDNN_DIR:-/home/dangqingqing/tools/cudnn-5.1/lib64}

# Create the dummy sample list (train.txt, 1024 identical lines) and the
# file list that points at it (train.list), unless they already exist.
function gen_file() {
  # -f, not -d: these are regular files. With -d the check was always true
  # and train.txt grew by another 1024 lines on every invocation.
  if [ ! -f "train.txt" ]; then
    for ((i=1;i<=1024;i++))
    do
      echo "train/n09246464/n09246464_38735.jpeg 972" >> train.txt
    done
  fi

  if [ ! -f "train.list" ]; then
    echo "train.txt" > train.list
  fi
}

# Run one timing job.
#   $1  trainer config (.py)
#   $2  trainer count (number of GPUs)
#   $3  batch size, forwarded to the config as batch_size=<n>
#   $4  log file prefix
function train() {
  local cfg=$1
  local thread=$2
  local bz=$3
  local args="batch_size=$3"
  local prefix=$4
  paddle train --job=time \
    --config="$cfg" \
    --use_gpu=True \
    --trainer_count="$thread" \
    --log_period=10 \
    --test_period=100 \
    --config_args="$args" \
    --cudnn_dir="$CUDNN_DIR" \
    > "logs/${prefix}-${thread}gpu-${bz}.log" 2>&1
}

gen_file
mkdir -p logs

#========single-gpu=========#
# alexnet
train alexnet.py 1 64 alexnet
train alexnet.py 1 128 alexnet
train alexnet.py 1 256 alexnet
train alexnet.py 1 512 alexnet

# googlenet
train googlenet.py 1 64 googlenet
train googlenet.py 1 128 googlenet
train googlenet.py 1 256 googlenet

# smallnet
train smallnet_mnist_cifar.py 1 64 smallnet
train smallnet_mnist_cifar.py 1 128 smallnet
train smallnet_mnist_cifar.py 1 256 smallnet
train smallnet_mnist_cifar.py 1 512 smallnet
#!/usr/bin/env python
# PaddlePaddle trainer config for the "SmallNet" benchmark: three
# convolution + pooling stages on 3 x 32 x 32 input, followed by two fully
# connected layers and a 10-way softmax.

from paddle.trainer_config_helpers import *

height=32
width=32
num_class = 10

# Batch size is supplied by the run scripts via --config_args=batch_size=N;
# 128 is the fallback when the argument is absent.
batch_size = get_config_arg('batch_size', int, 128)

# Forwarded as keyword arguments to the provider's init hook.
args={'height':height, 'width':width, 'color':True, 'num_class':num_class}
# Training data comes from `process` in provider.py; no test source (None).
define_py_data_sources2("train.list",
                        None,
                        module="provider",
                        obj="process",
                        args=args)

# NOTE(review): the learning rate is divided by, and the L2 coefficient
# multiplied by, batch_size - presumably to compensate for how Paddle
# normalizes gradients over a batch; confirm against the optimizer docs.
settings(
    batch_size = batch_size,
    learning_rate = 0.01 / batch_size,
    learning_method = MomentumOptimizer(0.9),
    regularization = L2Regularization(0.0005 * batch_size)
)


# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(input=net, filter_size=5, num_channels=3,
                     num_filters=32, stride=1, padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)

# conv2
net = img_conv_layer(input=net, filter_size=5, num_filters=32,
                     stride=1, padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())

# conv3
# NOTE(review): the Caffe smallnet_mnist_cifar.prototxt added in the same
# change uses kernel_size 5 / pad 2 for conv3 and no padding on its pooling
# layers; here conv3 is 3x3 / pad 1 and the pools use padding=1. Confirm the
# two configs are meant to be comparable.
net = img_conv_layer(input=net, filter_size=3, num_filters=64,
                     stride=1, padding=1)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())

net = fc_layer(input=net, size=64, act=ReluActivation())
net = fc_layer(input=net, size=10, act=SoftmaxActivation())

# The classification cost against the label input is the only network output.
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
0000000000000000000000000000000000000000..93e1686854b447c4248ae1809fb5289a36e3e0f7 --- /dev/null +++ b/benchmark/paddle/rnn/imdb.py @@ -0,0 +1,42 @@ +from __future__ import print_function +import six.moves.cPickle as pickle +import gzip +import os +import numpy + +def get_dataset_file(dataset, default_dataset, origin): + data_dir, data_file = os.path.split(dataset) + if (not os.path.isfile(dataset)) and data_file == default_dataset: + from six.moves import urllib + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + return dataset + +def create_data(path="imdb.pkl"): + + if (not os.path.isfile('imdb.train.pkl')): + path = get_dataset_file( + path, "imdb.pkl", + "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") + + if path.endswith(".gz"): + f = gzip.open(path, 'rb') + else: + f = open(path, 'rb') + + train_set = pickle.load(f) + test_set = pickle.load(f) + f.close() + + pickle.dump(train_set, open('imdb.train.pkl', 'wb')) + pickle.dump(test_set, open('imdb.test.pkl', 'wb')) + + if (not os.path.isfile('train.list')): + open('train.list', 'w').write('imdb.train.pkl\n') # open(): the file() builtin exists only on Python 2 + +def main(): + create_data('imdb.pkl') + +if __name__ == "__main__": + main() diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py new file mode 100644 index 0000000000000000000000000000000000000000..90d3fee67601604b236b27fb2e5492e92095cb72 --- /dev/null +++ b/benchmark/paddle/rnn/provider.py @@ -0,0 +1,64 @@ +import io,os +import random +import numpy as np +import six.moves.cPickle as pickle +from paddle.trainer.PyDataProvider2 import * + +def remove_unk(x, n_words): + return [[1 if w >= n_words else w for w in sen] for sen in x] + +# ============================================================== +# tensorflow uses fixed length, but PaddlePaddle can process +# variable-length. Padding is used in benchmark in order to +# compare with other platform.
+# ============================================================== +def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', + truncating='post', value=0.): + lengths = [len(s) for s in sequences] + + nb_samples = len(sequences) + if maxlen is None: + maxlen = np.max(lengths) + + x = (np.ones((nb_samples, maxlen)) * value).astype(dtype) + for idx, s in enumerate(sequences): + if len(s) == 0: + continue # empty list was found + if truncating == 'pre': + trunc = s[-maxlen:] + elif truncating == 'post': + trunc = s[:maxlen] + else: + raise ValueError("Truncating type '%s' not understood" % truncating) # report the offending truncating value, not padding + + if padding == 'post': + x[idx, :len(trunc)] = trunc + elif padding == 'pre': + x[idx, -len(trunc):] = trunc + else: + raise ValueError("Padding type '%s' not understood" % padding) + return x + + +def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs): + settings.vocab_size = vocab_size + settings.pad_seq = pad_seq + settings.maxlen = maxlen + settings.input_types = [ + integer_value_sequence(vocab_size), + integer_value(2)] + +@provider(init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, file): + f = open(file, 'rb') + train_set = pickle.load(f) + f.close() + x, y = train_set + + # remove unk, namely remove the words out of dictionary + x = remove_unk(x, settings.vocab_size) + if settings.pad_seq: + x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
+ + for i in range(len(y)): + yield map(int,x[i]), int(y[i]) diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py new file mode 100755 index 0000000000000000000000000000000000000000..fc8221b1126649d3d1b6a2a8743d25fe4a8d4aec --- /dev/null +++ b/benchmark/paddle/rnn/rnn.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +from paddle.trainer_config_helpers import * +import imdb + +num_class = 2 +vocab_size = 30000 +fixedlen = 100 +batch_size = get_config_arg('batch_size', int, 128) +lstm_num = get_config_arg('lstm_num', int, 1) +hidden_size = get_config_arg('hidden_size', int, 128) +# whether to pad sequence into fixed length +pad_seq = get_config_arg('pad_seq', bool, True) +imdb.create_data('imdb.pkl') + +args={'vocab_size':vocab_size, 'pad_seq':pad_seq, 'maxlen':fixedlen} +define_py_data_sources2("train.list", + None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +net = data_layer('data', size=vocab_size) +net = embedding_layer(input=net, size=128) + +for i in xrange(lstm_num): + net = simple_lstm(input=net, size=hidden_size) + +net = last_seq(input=net) +net = fc_layer(input=net, size=2, act=SoftmaxActivation()) + +lab = data_layer('label', num_class) +loss = classification_cost(input=net, label=lab) +outputs(loss) diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..92c6e0b4b4227bf5de8bf3682b0d86c98d3f2f2b --- /dev/null +++ b/benchmark/paddle/rnn/run.sh @@ -0,0 +1,38 @@ +set -e + +function train() { + cfg=$1 + thread=$2 + args="lstm_num=${3},pad_seq=${4},hidden_size=${5},batch_size=${6}" # key must be pad_seq: that is the name rnn.py reads via get_config_arg + paddle train --job=time \ + --config=$cfg \ + --use_gpu=1 \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --num_passes=1 \ + --feed_data=1 \ + --config_args=$args \ +
>logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1 +} + +if [ ! -d "logs" ]; then + mkdir logs +fi + +## padding, single gpu +#-----config--gpu--lstm_num--padding--hidden_size--batch_size +## lstm_num=2, batch_size=64 +train rnn.py 1 2 1 256 64 +train rnn.py 1 2 1 512 64 +train rnn.py 1 2 1 1280 64 + +## lstm_num=2, batch_size=128 +train rnn.py 1 2 1 256 128 +train rnn.py 1 2 1 512 128 +train rnn.py 1 2 1 1280 128 + +## lstm_num=2, batch_size=256 +train rnn.py 1 2 1 256 256 +train rnn.py 1 2 1 512 256 +train rnn.py 1 2 1 1280 256 diff --git a/benchmark/paddle/rnn/run_multi.sh b/benchmark/paddle/rnn/run_multi.sh new file mode 100755 index 0000000000000000000000000000000000000000..50ee469bcd98b75ee1b45078153456e6af814279 --- /dev/null +++ b/benchmark/paddle/rnn/run_multi.sh @@ -0,0 +1,34 @@ +set -e + +function train() { + cfg=$1 + thread=$2 + args="lstm_num=${3},pad_seq=${4},hidden_size=${5},batch_size=${6}" # key must be pad_seq: that is the name rnn.py reads via get_config_arg + paddle train --job=time \ + --config=$cfg \ + --use_gpu=1 \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --num_passes=1 \ + --feed_data=1 \ + --config_args=$args \ + >logs/rnn-pad${4}-${thread}gpu-lstm${3}-hid${5}-batch${6}.log 2>&1 +} + + +if [ !
-d "logs" ]; then + mkdir logs +fi + +#-----config--gpu--lstm_num--padding--hidden_size--batch_size +#==================multi gpus=====================# +# hidden_size=256, lstm_num=2, different batch size +train rnn.py 4 2 1 256 128 +train rnn.py 4 2 1 256 256 +train rnn.py 4 2 1 256 512 + +# hidden_size=512, lstm_num=2, different batch size +train rnn.py 4 2 1 512 128 +train rnn.py 4 2 1 512 256 +train rnn.py 4 2 1 512 512 diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..57b7ef6c323243c8e03324533d0022ab00bb8516 --- /dev/null +++ b/benchmark/tensorflow/image/alexnet.py @@ -0,0 +1,260 @@ +from six.moves import xrange # pylint: disable=redefined-builtin +from datetime import datetime +import math +import time + +import tensorflow.python.platform +import tensorflow as tf + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_integer('batch_size', 128, + """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, + """Number of batches to run.""") +tf.app.flags.DEFINE_boolean('forward_only', False, + """Only run the forward pass.""") +tf.app.flags.DEFINE_boolean('forward_backward_only', False, + """Only run the forward-backward pass.""") +tf.app.flags.DEFINE_string('data_format', 'NCHW', + """The data format for Convnet operations. + Can be either NHWC or NCHW.
+ """) +tf.app.flags.DEFINE_boolean('log_device_placement', False, + """Whether to log device placement.""") + +def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005): + with tf.name_scope(name) as scope: + kernel = tf.get_variable(name + '_w',[kH, kW, nIn, nOut], + initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + if wd is not None and wd > 0: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + if FLAGS.data_format == 'NCHW': + strides = [1, 1, dH, dW] + else: + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, + data_format=FLAGS.data_format) + + biases = tf.get_variable(name=name + '_b', shape=[nOut], + initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + dtype=tf.float32) + + bias = tf.reshape( + tf.nn.bias_add(conv, biases, data_format=FLAGS.data_format), + conv.get_shape()) + + conv1 = tf.nn.relu(bias, name=scope) + return conv1 + +def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None): + with tf.name_scope(name) as scope: + kernel = tf.get_variable(name + '_w', [nIn, nOut], + initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + if wd is not None and wd > 0: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + biases = tf.get_variable(name + '_b', [nOut], + initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + dtype=tf.float32,trainable=True) + + affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ + tf.matmul(inpOp, kernel) + biases + + output = tf.nn.dropout(affine1, drop) if drop else affine1 + + return output + +def _mpool(name, inpOp, kH, kW, dH, dW): + if FLAGS.data_format == 'NCHW': + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] + else: + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return 
tf.nn.max_pool(inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + data_format=FLAGS.data_format, + name=name) + +def _norm(name, l_input, lsize=4): + return tf.nn.lrn(l_input, lsize, bias=1.0, + alpha=0.001 / 9.0, + beta=0.75, name=name) + + + +def loss(logits, labels): + labels = tf.cast(labels, tf.int64) + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits, labels, name='cross_entropy_per_example') + cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') + tf.add_to_collection('losses', cross_entropy_mean) + + # The total loss is defined as the cross entropy loss plus all of the weight + # decay terms (L2 loss). + return tf.add_n(tf.get_collection('losses'), name='total_loss') + +def get_incoming_shape(incoming): + """ Returns the incoming data shape """ + if isinstance(incoming, tf.Tensor): + return incoming.get_shape().as_list() + elif type(incoming) in [np.array, list, tuple]: + return np.shape(incoming) + else: + raise Exception("Invalid incoming layer.") + +def inference(images): + conv1 = _conv ('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') + pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) + norm1 = _norm ('norm1', pool1, lsize=5) + conv2 = _conv ('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') + pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) + norm2 = _norm ('norm2', pool2, lsize=5) + conv3 = _conv ('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') + conv4 = _conv ('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') + conv5 = _conv ('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') + pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) + resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) + affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5) + affn2 = _affine('fc7', affn1, 4096, 4096, 0.5) + affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc + + return affn3 + + +def time_tensorflow_run(session, target, info_string): + num_steps_burn_in = 10 + total_duration = 0.0 + total_duration_squared = 0.0 + if not isinstance(target, 
list): + target = [target] + target_op = tf.group(*target) + for i in xrange(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _ = session.run(target_op) + duration = time.time() - start_time + if i >= num_steps_burn_in: # >= so exactly FLAGS.num_batches steps are timed; the mean below divides by num_batches + if not i % 10: + print ('%s: step %d, duration = %.3f' % + (datetime.now(), i - num_steps_burn_in, duration)) + total_duration += duration + total_duration_squared += duration * duration + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + +def _add_loss_summaries(total_loss): + """ + Generates moving average for all losses and associated summaries for + visualizing the performance of the network. + + Args: + total_loss: Total loss from loss(). + Returns: + loss_averages_op: op for generating moving averages of losses. + """ + # Compute the moving average of all individual losses and the total loss. + loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') + losses = tf.get_collection('losses') + loss_averages_op = loss_averages.apply(losses + [total_loss]) + + # Attach a scalar summary to all individual losses and the total loss; do the + # same for the averaged version of the losses. + for l in losses + [total_loss]: + # Name each loss as '(raw)' and name the moving average version of the loss + # as the original loss name. + tf.scalar_summary(l.op.name +' (raw)', l) + tf.scalar_summary(l.op.name, loss_averages.average(l)) + + return loss_averages_op + + + +def run_benchmark(): + with tf.Graph().as_default(): + with tf.device('/gpu:0'): + # Generate some dummy images. + image_size = 224 + # Note that our padding definition is slightly different the cuda-convnet. + # In order to force the model to start with the same activations sizes, + # we add 3 to the image_size and employ VALID padding above.
+ if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] + else: + image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] + images = tf.get_variable('image', image_shape, + initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable('label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + # Compute the gradient with respect to all the parameters. + + # Compute gradients. + # opt = tf.train.GradientDescentOptimizer(0.001) + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable('global_step', [], + initializer=tf.constant_initializer(0.0, dtype=tf.float32), + trainable=False, dtype=tf.float32) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage( + 0.9, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables()) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. 
+ sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + time_tensorflow_run(sess, last_layer, "Forward") + + if run_forward_backward: + with tf.control_dependencies([apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], "Forward-backward") + +def main(_): + run_benchmark() + + +if __name__ == '__main__': + tf.app.run() diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..949ad77f3b862bc82ec9e5d663e41a405bf272ce --- /dev/null +++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py @@ -0,0 +1,335 @@ +from six.moves import xrange # pylint: disable=redefined-builtin +from datetime import datetime +import math +import re +import time + +import tensorflow.python.platform +import tensorflow as tf + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_integer('batch_size', 64, + """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, + """Number of batches to run.""") +tf.app.flags.DEFINE_string('data_format', 'NCHW', + """The data format for Convnet operations. + Can be either NHWC or NCHW. 
+ """) + +tf.app.flags.DEFINE_string('train_dir', '/train_model', + """Directory where to write event logs """ + """and checkpoint.""") +tf.app.flags.DEFINE_integer('num_gpus', 4, + """How many GPUs to use.""") +tf.app.flags.DEFINE_boolean('log_device_placement', False, + """Whether to log device placement.""") + +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN=50000 +NUM_EPOCHS_PER_DECAY=50 +INITIAL_LEARNING_RATE = 0.1 +LEARNING_RATE_DECAY_FACTOR = 0.1 +TOWER_NAME = 'tower' + + +def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005): + with tf.name_scope(name) as scope: + kernel = tf.get_variable(name + '_w',[kH, kW, nIn, nOut], + initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + if wd is not None: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + if FLAGS.data_format == 'NCHW': + strides = [1, 1, dH, dW] + else: + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, + data_format=FLAGS.data_format) + + biases = tf.get_variable(name=name + '_b', shape=[nOut], + initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + dtype=tf.float32) + + bias = tf.reshape( + tf.nn.bias_add(conv, biases, data_format=FLAGS.data_format), + conv.get_shape()) + + conv1 = tf.nn.relu(bias, name=scope) + return conv1 + +def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True): + with tf.name_scope(name) as scope: + kernel = tf.get_variable(name + '_w', [nIn, nOut], + initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + if wd is not None: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + biases = tf.get_variable(name + '_b', [nOut], + initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + dtype=tf.float32,trainable=True) + + affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else 
\ + tf.matmul(inpOp, kernel) + biases + + return affine1 + +def _mpool(name, inpOp, kH, kW, dH, dW): + if FLAGS.data_format == 'NCHW': + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] + else: + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool(inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + data_format=FLAGS.data_format, + name=name) + +def _norm(name, l_input, lsize=4): + return tf.nn.lrn(l_input, lsize, bias=1.0, + alpha=0.001 / 9.0, + beta=0.75, name=name) + +def loss(logits, labels): + labels = tf.cast(labels, tf.int64) + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits, labels, name='cross_entropy_per_example') + cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') + tf.add_to_collection('losses', cross_entropy_mean) + + # The total loss is defined as the cross entropy loss plus all of the weight + # decay terms (L2 loss). + return tf.add_n(tf.get_collection('losses'), name='total_loss') + + +def get_incoming_shape(incoming): + """ Returns the incoming data shape """ + if isinstance(incoming, tf.Tensor): + return incoming.get_shape().as_list() + elif type(incoming) in [np.array, list, tuple]: + return np.shape(incoming) + else: + raise Exception("Invalid incoming layer.") + +def inference(images): + conv1 = _conv ('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') + pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) + norm1 = _norm ('norm1', pool1, lsize=5) + conv2 = _conv ('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') + pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) + norm2 = _norm ('norm2', pool2, lsize=5) + conv3 = _conv ('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') + conv4 = _conv ('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') + conv5 = _conv ('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') + pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) + resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) + affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096) + affn2 = _affine('fc7', affn1, 4096, 4096) + affn3 = 
_affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc + + return affn3 + +def tower_loss(scope): + """Calculate the total loss on a single tower running the model. + Args: + scope: unique prefix string identifying the tower, e.g. 'tower_0' + Returns: + Tensor of shape [] containing the total loss for a batch of data + """ + image_size = 224 + if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] + else: + image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] + images = tf.get_variable('image', image_shape, + initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable('label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + # Build the portion of the Graph calculating the losses. Note that we will + # assemble the total_loss using a custom function below. + _ = loss(last_layer, labels) + + # Assemble all of the losses for the current tower only. + losses = tf.get_collection('losses', scope) + + # Calculate the total loss for the current tower. + total_loss = tf.add_n(losses, name='total_loss') + + # Compute the moving average of all individual losses and the total loss. + loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') + loss_averages_op = loss_averages.apply(losses + [total_loss]) + + # Attach a scalar summary to all individual losses and the total loss; do the + # same for the averaged version of the losses. + for l in losses + [total_loss]: + # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training + # session. This helps the clarity of presentation on tensorboard. 
+ loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) + # Name each loss as '(raw)' and name the moving average version of the loss + # as the original loss name. + tf.scalar_summary(loss_name +' (raw)', l) + tf.scalar_summary(loss_name, loss_averages.average(l)) + + with tf.control_dependencies([loss_averages_op]): + total_loss = tf.identity(total_loss) + return total_loss + + +def average_gradients(tower_grads): + """Calculate the average gradient for each shared variable across all towers. + Note that this function provides a synchronization point across all towers. + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over individual gradients. The inner list is over the gradient + calculation for each tower. + Returns: + List of pairs of (gradient, variable) where the gradient has been averaged + across all towers. + """ + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(0, grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. 
+ v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + +def time_tensorflow_run(session, target): + num_steps_burn_in = 50 + total_duration = 0.0 + total_duration_squared = 0.0 + for i in xrange(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _, loss_value = session.run(target) + duration = time.time() - start_time + if i >= num_steps_burn_in: # >= so exactly FLAGS.num_batches steps are timed; the mean below divides by num_batches + if not i % 10: + num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus + examples_per_sec = num_examples_per_step / duration + sec_per_batch = duration + + format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' + 'sec/batch batch_size = %d)') + print (format_str % + (datetime.now(), i - num_steps_burn_in, + loss_value, examples_per_sec, sec_per_batch, num_examples_per_step)) # throughput slot gets examples_per_sec, not the raw duration + + total_duration += duration + total_duration_squared += duration * duration + + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), FLAGS.num_batches, mn, sd)) + +def run_benchmark(): + with tf.Graph().as_default(), tf.device('/cpu:0'): + # Create a variable to count the number of train() calls. This equals the + # number of batches processed * FLAGS.num_gpus. + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), trainable=False) + + # Calculate the learning rate schedule. + num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / + FLAGS.batch_size) + decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) + + # Decay the learning rate exponentially based on the number of steps. + lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, + global_step, + decay_steps, + LEARNING_RATE_DECAY_FACTOR, + staircase=True) + + # Create an optimizer that performs gradient descent.
+ # opt = tf.train.GradientDescentOptimizer(lr) + opt = tf.train.MomentumOptimizer(lr, 0.9) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. + loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. 
+ sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + time_tensorflow_run(sess, [train_op, loss]) + + +def main(_): + run_benchmark() + + +if __name__ == '__main__': + tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py new file mode 100644 index 0000000000000000000000000000000000000000..097a8997b78ff55813897b7f32c4d7d931e8288d --- /dev/null +++ b/benchmark/tensorflow/image/googlenet.py @@ -0,0 +1,282 @@ +from six.moves import xrange +from datetime import datetime +import math +import time + +import tensorflow.python.platform +import tensorflow as tf + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_integer('batch_size', 128, + """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, + """Number of batches to run.""") +tf.app.flags.DEFINE_boolean('forward_only', False, + """Only run the forward pass.""") +tf.app.flags.DEFINE_boolean('forward_backward_only', False, + """Only run the forward-forward pass.""") +tf.app.flags.DEFINE_string('data_format', 'NCHW', + """The data format for Convnet operations. + Can be either NHWC or NCHW. 
+ """) +tf.app.flags.DEFINE_boolean('log_device_placement', False, + """Whether to log device placement.""") + +parameters = [] + +conv_counter = 1 +pool_counter = 1 +affine_counter = 1 + +def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd = 0.0005): + global conv_counter + global parameters + name = 'conv' + str(conv_counter) + conv_counter += 1 + with tf.name_scope(name) as scope: + kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], + dtype=tf.float32, + stddev=1e-1), name='weights') + + if wd is not None and wd > 0: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + if FLAGS.data_format == 'NCHW': + strides = [1, 1, dH, dW] + else: + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, + data_format=FLAGS.data_format) + biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), + trainable=True, name='biases') + bias = tf.reshape(tf.nn.bias_add(conv, biases, + data_format=FLAGS.data_format), + conv.get_shape()) + conv1 = tf.nn.relu(bias, name=scope) + parameters += [kernel, biases] + return conv1 + +def _affine(inpOp, nIn, nOut, act=True, wd = 0.0005): + global affine_counter + global parameters + name = 'affine' + str(affine_counter) + affine_counter += 1 + with tf.name_scope(name) as scope: + kernel = tf.Variable(tf.truncated_normal([nIn, nOut], + dtype=tf.float32, + stddev=1e-1), name='weights') + + if wd is not None and wd > 0: + weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), + trainable=True, name='biases') + affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else tf.matmul(inpOp, kernel) + biases + parameters += [kernel, biases] + return affine1 + +def _mpool(inpOp, kH, kW, dH, dW, padding): + global pool_counter + global parameters + name = 'pool' + 
str(pool_counter) + pool_counter += 1 + if FLAGS.data_format == 'NCHW': + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] + else: + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool(inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + +def _apool(inpOp, kH, kW, dH, dW, padding): + global pool_counter + global parameters + name = 'pool' + str(pool_counter) + pool_counter += 1 + if FLAGS.data_format == 'NCHW': + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] + else: + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.avg_pool(inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + +def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): + conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID') + + conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID') + conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME') + + conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID') + conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME') + + pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME') + pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID') + + if FLAGS.data_format == 'NCHW': + channel_dim = 1 + else: + channel_dim = 3 + incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool]) + return incept + + +def loss(logits, labels): + batch_size = tf.size(labels) + labels = tf.expand_dims(labels, 1) + indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) + concated = tf.concat(1, [indices, labels]) + onehot_labels = tf.sparse_to_dense( + concated, tf.pack([batch_size, 1000]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, + onehot_labels, + name='xentropy') + loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') + return loss + +def inference(images): + # stage 1 + conv1 = _conv (images, 3, 64, 7, 7, 2, 2, 'SAME') + pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') + # stage 2 + conv2 = _conv (pool1, 
def time_tensorflow_run(session, target, info_string):
    """Time repeated runs of `target` and print per-batch statistics.

    ``num_steps_burn_in`` warm-up steps are executed first and excluded
    from the statistics; the following ``FLAGS.num_batches`` steps are
    measured with ``time.time()``.

    Args:
        session: object whose ``run`` method executes the grouped op.
        target: a single op/tensor or a list of them; they are combined
            with ``tf.group`` so one ``session.run`` evaluates all.
        info_string: label printed in the summary line.

    Returns:
        None. A progress line is printed for every step whose index is a
        multiple of 10, followed by a final "mean +/- std dev" line.
    """
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0
    if not isinstance(target, list):
        target = [target]
    target_op = tf.group(*target)
    for i in range(FLAGS.num_batches + num_steps_burn_in):
        start_time = time.time()
        session.run(target_op)
        duration = time.time() - start_time
        # `>=`, not `>`: steps 0..num_steps_burn_in-1 are the warm-up, so
        # exactly FLAGS.num_batches samples are accumulated, matching the
        # divisor used for the mean and variance below.
        if i >= num_steps_burn_in:
            if not i % 10:
                print ('%s: step %d, duration = %.3f' %
                       (datetime.now(), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    mn = total_duration / FLAGS.num_batches
    vr = total_duration_squared / FLAGS.num_batches - mn * mn
    # Rounding can push a near-zero variance slightly below zero, which
    # would make math.sqrt raise ValueError.
    sd = math.sqrt(max(vr, 0.0))
    print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
           (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
Generate some dummy images. + image_size = 224 + if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size, image_size] + else: + image_shape = [FLAGS.batch_size, image_size, image_size, 3] + + images = tf.get_variable('image', image_shape, + initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable('label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + + # Compute gradients. + # opt = tf.train.GradientDescentOptimizer(0.001) + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable('global_step', [], + initializer=tf.constant_initializer(0.0, dtype=tf.float32), + trainable=False, dtype=tf.float32) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage( + 0.9, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables()) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + # Run the forward benchmark. 
+ time_tensorflow_run(sess, last_layer, "Forward") + + if run_forward_backward: + with tf.control_dependencies([apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], "Forward-backward") + + +def main(_): + run_benchmark() + + +if __name__ == '__main__': + tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..e22a6b6253eedcbc2680309a29de10c9dd2bf4ff --- /dev/null +++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py @@ -0,0 +1,381 @@ +from six.moves import xrange # pylint: disable=redefined-builtin +from datetime import datetime +import math +import re +import time + +import tensorflow.python.platform +import tensorflow as tf + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_integer('batch_size', 64, + """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, + """Number of batches to run.""") +tf.app.flags.DEFINE_string('data_format', 'NCHW', + """The data format for Convnet operations. + Can be either NHWC or NCHW. 
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
    """Add a named convolution + bias + ReLU layer and return its output.

    Kernel and bias are obtained with ``tf.get_variable`` under the names
    ``name + '_w'`` and ``name + '_b'``.

    Args:
        name: layer name; used for the name scope and the variable names.
        inpOp: input handed to ``tf.nn.conv2d`` (interpreted according to
            ``FLAGS.data_format``).
        nIn: input channel count.
        nOut: output channel count.
        kH, kW: kernel height and width.
        dH, dW: strides along height and width.
        padType: value forwarded as ``padding`` to ``tf.nn.conv2d``.
        wd: weight-decay factor; unless it is None, ``wd * l2_loss(kernel)``
            is added to the 'losses' collection.

    Returns:
        The ReLU-activated tensor.
    """
    data_format = FLAGS.data_format
    with tf.name_scope(name) as scope:
        kernel_init = tf.truncated_normal_initializer(stddev=0.01,
                                                      dtype=tf.float32)
        kernel = tf.get_variable(name + '_w', [kH, kW, nIn, nOut],
                                 initializer=kernel_init,
                                 dtype=tf.float32)

        if wd is not None:
            tf.add_to_collection(
                'losses',
                tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss'))

        strides = [1, 1, dH, dW] if data_format == 'NCHW' else [1, dH, dW, 1]
        conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType,
                            data_format=data_format)

        bias_init = tf.constant_initializer(value=0.0, dtype=tf.float32)
        biases = tf.get_variable(name=name + '_b', shape=[nOut],
                                 initializer=bias_init,
                                 dtype=tf.float32)

        # bias_add output is reshaped to the static shape of the conv output.
        pre_activation = tf.nn.bias_add(conv, biases, data_format=data_format)
        pre_activation = tf.reshape(pre_activation, conv.get_shape())
        return tf.nn.relu(pre_activation, name=scope)
def _mpool(name, inpOp, kH, kW, dH, dW, padding):
    """Apply max pooling to `inpOp` and return the result.

    Args:
        name: name given to the pooling op.
        inpOp: input tensor (interpreted according to ``FLAGS.data_format``).
        kH, kW: pooling window height and width.
        dH, dW: strides along height and width.
        padding: value forwarded as ``padding`` to ``tf.nn.max_pool``.

    Returns:
        The output of ``tf.nn.max_pool``.
    """
    data_format = FLAGS.data_format
    channels_first = data_format == 'NCHW'
    # Window and stride only act on the two spatial axes; their position
    # depends on the data format.
    window = [1, 1, kH, kW] if channels_first else [1, kH, kW, 1]
    step = [1, 1, dH, dW] if channels_first else [1, dH, dW, 1]
    return tf.nn.max_pool(inpOp,
                          ksize=window,
                          strides=step,
                          padding=padding,
                          data_format=data_format,
                          name=name)
def get_incoming_shape(incoming):
    """Return the shape of `incoming`.

    Args:
        incoming: a ``tf.Tensor``, a NumPy array, a list or a tuple.

    Returns:
        For a ``tf.Tensor``, its static shape as a list
        (``get_shape().as_list()``); for an array, list or tuple, the
        tuple returned by ``np.shape``.

    Raises:
        Exception: if `incoming` is none of the supported types.
    """
    # Imported locally: this module has no file-level numpy import, so the
    # previous reference to `np` raised NameError for every non-Tensor input.
    import numpy as np

    # `np.ndarray` is the array type; `np.array` is a factory function and
    # never matched `type(incoming)`.
    if isinstance(incoming, (np.ndarray, list, tuple)):
        return np.shape(incoming)
    if isinstance(incoming, tf.Tensor):
        return incoming.get_shape().as_list()
    raise Exception("Invalid incoming layer.")
def tower_loss(scope):
    """Build one model tower and return its total loss.

    Args:
        scope: unique prefix string identifying the tower, e.g. 'tower_0'.

    Returns:
        Tensor holding the total loss of the tower; evaluating it also
        triggers the update of the loss moving averages.
    """
    side = 224
    if FLAGS.data_format == 'NCHW':
        input_shape = [FLAGS.batch_size, 3, side, side]
    else:
        input_shape = [FLAGS.batch_size, side, side, 3]

    # Synthetic inputs: non-trainable variables stand in for a data pipeline.
    images = tf.get_variable(
        'image', input_shape,
        initializer=tf.truncated_normal_initializer(stddev=0.1,
                                                    dtype=tf.float32),
        dtype=tf.float32,
        trainable=False)
    labels = tf.get_variable(
        'label', [FLAGS.batch_size],
        initializer=tf.constant_initializer(1),
        dtype=tf.int32,
        trainable=False)

    logits = inference(images)

    # Called for its side effect of adding the cross entropy to the
    # 'losses' collection; the per-tower total is assembled below.
    loss(logits, labels)

    # Only the losses created under this tower's scope.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # Moving averages of every individual loss and of the total.
    tracked = tower_losses + [total_loss]
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    update_averages = averager.apply(tracked)

    for tensor in tracked:
        # Strip the 'tower_N/' prefix so summaries from different towers
        # share one name.
        summary_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', tensor.op.name)
        # Raw value gets the ' (raw)' suffix; the averaged value keeps the
        # plain loss name.
        tf.scalar_summary(summary_name +' (raw)', tensor)
        tf.scalar_summary(summary_name, averager.average(tensor))

    with tf.control_dependencies([update_averages]):
        total_loss = tf.identity(total_loss)
    return total_loss
def time_tensorflow_run(session, target):
    """Time repeated runs of `target` and print per-batch statistics.

    ``num_steps_burn_in`` warm-up steps are executed first and excluded
    from the statistics; the following ``FLAGS.num_batches`` steps are
    measured with ``time.time()``.

    Args:
        session: object whose ``run`` method executes `target`.
        target: fetches passed to ``session.run``; the result is unpacked
            as two values, the second being the loss value that is printed.

    Returns:
        None. A progress line is printed for every step whose index is a
        multiple of 10, followed by a final "mean +/- std dev" line.
    """
    num_steps_burn_in = 50
    total_duration = 0.0
    total_duration_squared = 0.0
    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
        start_time = time.time()
        _, loss_value = session.run(target)
        duration = time.time() - start_time
        # `>=`, not `>`: steps 0..num_steps_burn_in-1 are the warm-up, so
        # exactly FLAGS.num_batches samples are accumulated, matching the
        # divisor used for the mean and variance below.
        if i >= num_steps_burn_in:
            if not i % 10:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch batch_size = %d)')
                # The "examples/sec" slot receives examples_per_sec; it
                # previously received `duration`.
                print (format_str %
                       (datetime.now(), i - num_steps_burn_in,
                        loss_value, examples_per_sec, sec_per_batch,
                        num_examples_per_step))

            total_duration += duration
            total_duration_squared += duration * duration

    mn = total_duration / FLAGS.num_batches
    vr = total_duration_squared / FLAGS.num_batches - mn * mn
    # Rounding can push a near-zero variance slightly below zero, which
    # would make math.sqrt raise ValueError.
    sd = math.sqrt(max(vr, 0.0))
    print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
           (datetime.now(), FLAGS.num_batches, mn, sd))
+ opt = tf.train.MomentumOptimizer(lr, 0.9) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. + loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. 
set -e

# Run one single-GPU benchmark and capture stdout and stderr in logs/.
#   $1  benchmark script to run with python (e.g. alexnet.py)
#   $2  batch size, forwarded as --batch_size
#   $3  prefix of the log file name
# NOTE: this function is named `test`, which shadows the shell builtin of
# the same name for the rest of the script.
function test() {
  # `local` keeps these from leaking into the script's global scope.
  local cfg=$1
  local batch_size=$2
  local prefix=$3
  # Quoted so that arguments containing spaces or glob characters are
  # passed through unchanged.
  python "$cfg" --batch_size="$batch_size" > "logs/${prefix}-1gpu-${batch_size}.log" 2>&1
}

# -p: succeed whether or not the directory already exists.
mkdir -p logs

# alexnet
test alexnet.py 64 alexnet
test alexnet.py 128 alexnet
test alexnet.py 256 alexnet
test alexnet.py 512 alexnet

# googlenet
test googlenet.py 64 googlenet
test googlenet.py 128 googlenet

# smallnet
test smallnet_mnist_cifar.py 64 smallnet
test smallnet_mnist_cifar.py 128 smallnet
test smallnet_mnist_cifar.py 256 smallnet
test smallnet_mnist_cifar.py 512 smallnet
# Arguments to `test`: script, number of GPUs, total batch size, log prefix.

# alexnet
test alexnet_multi_gpu.py 4 512 alexnet
test alexnet_multi_gpu.py 4 1024 alexnet

# googlenet
# The log file name is built from the prefix and the batch size, and the
# file is opened with `>`. With the prefix "alexnet" these two runs wrote
# to the same files as the AlexNet runs above and overwrote their results.
test googlenet_multi_gpu.py 4 512 googlenet
test googlenet_multi_gpu.py 4 1024 googlenet
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
    """Add a convolution + bias layer, optionally followed by ReLU.

    The layer is named ``'conv' + str(conv_counter)``; the module-level
    counter is incremented on every call, and the layer's kernel and bias
    variables are appended to the module-level ``parameters`` list.

    Args:
        inpOp: input handed to ``tf.nn.conv2d`` (interpreted according to
            ``FLAGS.data_format``).
        nIn: input channel count.
        nOut: output channel count.
        kH, kW: kernel height and width.
        dH, dW: strides along height and width.
        padType: value forwarded as ``padding`` to ``tf.nn.conv2d``.
        wd: weight-decay factor; unless it is None, ``wd * l2_loss(kernel)``
            is added to the 'losses' collection.
        act: when true the output goes through ReLU, otherwise the
            bias-added convolution is returned as is.

    Returns:
        The layer output tensor.
    """
    global conv_counter
    layer_name = 'conv' + str(conv_counter)
    conv_counter += 1

    data_format = FLAGS.data_format
    with tf.name_scope(layer_name) as scope:
        initial_kernel = tf.truncated_normal([kH, kW, nIn, nOut],
                                             dtype=tf.float32, stddev=1e-1)
        kernel = tf.Variable(initial_kernel, name='weights')

        if wd is not None:
            tf.add_to_collection(
                'losses',
                tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss'))

        strides = [1, 1, dH, dW] if data_format == 'NCHW' else [1, dH, dW, 1]
        conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType,
                            data_format=data_format)

        initial_bias = tf.constant(0.0, shape=[nOut], dtype=tf.float32)
        biases = tf.Variable(initial_bias, trainable=True, name='biases')

        # bias_add output is reshaped to the static shape of the conv output.
        output = tf.nn.bias_add(conv, biases, data_format=data_format)
        output = tf.reshape(output, conv.get_shape())
        if act:
            output = tf.nn.relu(output, name=scope)

        parameters.extend([kernel, biases])
        return output
def get_incoming_shape(incoming):
    """Return the shape of `incoming`.

    Args:
        incoming: a ``tf.Tensor``, a NumPy array, a list or a tuple.

    Returns:
        For a ``tf.Tensor``, its static shape as a list
        (``get_shape().as_list()``); for an array, list or tuple, the
        tuple returned by ``np.shape``.

    Raises:
        Exception: if `incoming` is none of the supported types.
    """
    # Imported locally: this module has no file-level numpy import, so the
    # previous reference to `np` raised NameError for every non-Tensor input.
    import numpy as np

    # `np.ndarray` is the array type; `np.array` is a factory function and
    # never matched `type(incoming)`.
    if isinstance(incoming, (np.ndarray, list, tuple)):
        return np.shape(incoming)
    if isinstance(incoming, tf.Tensor):
        return incoming.get_shape().as_list()
    raise Exception("Invalid incoming layer.")
def time_tensorflow_run(session, target, info_string):
    """Time repeated runs of `target` and print per-batch statistics.

    ``num_steps_burn_in`` warm-up steps are executed first and excluded
    from the statistics; the following ``FLAGS.num_batches`` steps are
    measured with ``time.time()``.

    Args:
        session: object whose ``run`` method executes the grouped op.
        target: a single op/tensor or a list of them; they are combined
            with ``tf.group`` so one ``session.run`` evaluates all.
        info_string: label printed in the summary line.

    Returns:
        None. A progress line is printed for every step whose index is a
        multiple of 10, followed by a final "mean +/- std dev" line.
    """
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0
    if not isinstance(target, list):
        target = [target]
    target_op = tf.group(*target)
    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
        start_time = time.time()
        session.run(target_op)
        duration = time.time() - start_time
        # `>=`, not `>`: steps 0..num_steps_burn_in-1 are the warm-up, so
        # exactly FLAGS.num_batches samples are accumulated, matching the
        # divisor used for the mean and variance below.
        if i >= num_steps_burn_in:
            if not i % 10:
                print ('%s: step %d, duration = %.3f' %
                       (datetime.now(), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    mn = total_duration / FLAGS.num_batches
    vr = total_duration_squared / FLAGS.num_batches - mn * mn
    # Rounding can push a near-zero variance slightly below zero, which
    # would make math.sqrt raise ValueError.
    sd = math.sqrt(max(vr, 0.0))
    print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
           (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+ if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size, image_size] + else: + image_shape = [FLAGS.batch_size, image_size, image_size, 3] + + images = tf.get_variable('image', image_shape, + initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable('label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + + # Compute gradients. + # opt = tf.train.GradientDescentOptimizer(0.001) + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable('global_step', [], + initializer=tf.constant_initializer(0.0, dtype=tf.float32), + trainable=False, dtype=tf.float32) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage( + 0.9, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables()) + + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + # Run the forward benchmark. 
class DataSet(object):
    """In-memory dataset that serves consecutive mini-batches.

    `data` and `labels` must have the same length along their first axis.
    Batches are served in stored order; whenever a requested batch would
    run past the end, the epoch counter is incremented, examples and
    labels are reshuffled with one shared permutation, and the batch is
    taken from the start of the reshuffled data.
    """

    def __init__(self, data, labels):
        """Store `data` and `labels` and reset the epoch bookkeeping."""
        assert data.shape[0] == labels.shape[0], (
            'data.shape: %s labels.shape: %s' % (data.shape,
                                                 labels.shape))
        self._data = data
        self._labels = labels
        self._num_examples = data.shape[0]
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def data(self):
        """Examples in their current (possibly reshuffled) order."""
        return self._data

    @property
    def labels(self):
        """Labels in the same order as `data`."""
        return self._labels

    @property
    def num_examples(self):
        """Number of examples."""
        return self._num_examples

    @property
    def epochs_completed(self):
        """Number of times the data has been exhausted and reshuffled."""
        return self._epochs_completed

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples as ``(data, labels)``.

        `batch_size` must not exceed the number of examples.
        """
        assert batch_size <= self._num_examples

        begin = self._index_in_epoch
        stop = begin + batch_size
        if stop > self._num_examples:
            # Epoch finished: the examples left over are not served.
            self._epochs_completed += 1
            # One permutation for both arrays keeps examples and labels
            # aligned.
            order = np.arange(self._num_examples)
            np.random.shuffle(order)
            self._data = self._data[order]
            self._labels = self._labels[order]
            # The new epoch starts with a full batch from the beginning.
            begin, stop = 0, batch_size
        self._index_in_epoch = stop

        return self._data[begin:stop], self._labels[begin:stop]
def lstm(name, incoming, n_units, use_peepholes=True,
         return_seq=False, return_state=False, num_layers=1):
    """Build a (possibly stacked) LSTM over `incoming`.

    Args:
        name: name scope for the ops created here.
        incoming: either a list of per-step inputs, or a single tensor
            that is split into ``FLAGS.max_len`` slices along axis 1, each
            squeezed on that axis (per the original comment, an embedding
            output of shape [None, time_step, emb_size]).
        n_units: number of units of each LSTM cell.
        use_peepholes: forwarded to ``tf.nn.rnn_cell.LSTMCell``.
        return_seq: when true return the outputs of every step, otherwise
            only the last step's output.
        return_state: when true also return the final state.
        num_layers: how many times the cell is repeated in the
            ``MultiRNNCell``.

    Returns:
        ``out`` or, if `return_state` is true, ``(out, state)``, where
        ``out`` is the list of step outputs or the last output and
        ``state`` is the final state returned by ``tf.nn.rnn``.
    """
    with tf.name_scope(name) as scope:
        lstm_cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
        # The same cell object is repeated num_layers times.
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
        initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
        if not isinstance(incoming, list):
            # A single tensor is turned into the per-step list that
            # tf.nn.rnn expects.
            incoming = [tf.squeeze(input_, [1])
                        for input_ in tf.split(1, FLAGS.max_len, incoming)]
        outputs, state = tf.nn.rnn(cell, incoming, initial_state=initial_state,
                                   dtype=tf.float32)
        out = outputs if return_seq else outputs[-1]
        # The final state is bound to `state`; the previous code returned
        # the undefined name `_cell_state` and raised NameError whenever
        # return_state was true.
        return (out, state) if return_state else out
num_steps_burn_in): + start_time = time.time() + data, label = train_dataset.next_batch(FLAGS.batch_size) + _ = session.run(target_op, feed_dict={x_input:data, y_input:label}) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + print ('%s: step %d, duration = %.3f' % + (datetime.now(), i - num_steps_burn_in, duration)) + total_duration += duration + total_duration_squared += duration * duration + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + + +def run_benchmark(): + with tf.Graph().as_default(): + global_step=0 + with tf.device('/cpu:0'): + global_step = tf.Variable(0, trainable=False) + with tf.device('/gpu:0'): + #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input") + #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input") + x_input = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input") + y_input = tf.placeholder(tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input") + # Generate some dummy sequnce. 
+ + + last_layer = inference(x_input) + + objective = loss(last_layer, y_input) + opt = tf.train.AdamOptimizer(0.001) + grads = opt.compute_gradients(objective) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + init = tf.initialize_all_variables() + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + time_tensorflow_run(sess, last_layer, x_input, y_input, "Forward") + + if run_forward_backward: + with tf.control_dependencies([apply_gradient_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], x_input, y_input, "Forward-backward") + + +def main(_): + run_benchmark() + + +if __name__ == '__main__': + tf.app.run() + diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py b/benchmark/tensorflow/rnn/rnn_multi_gpu.py new file mode 100755 index 0000000000000000000000000000000000000000..97ba5d4c29672afe2756850430351b2abdeb20ca --- /dev/null +++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python +from six.moves import xrange # pylint: disable=redefined-builtin +import re +import math +import time +import numpy as np +from datetime import datetime + +import reader +import tensorflow as tf +from tensorflow.python.ops import rnn + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_integer('batch_size', 64, + """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, + """Number of batches to run.""") +tf.app.flags.DEFINE_integer('num_layers', 1, + """Number of batches to run.""") +tf.app.flags.DEFINE_integer('max_len', 100, + """Number of 
batches to run.""") +tf.app.flags.DEFINE_integer('hidden_size', 128, + """Number of batches to run.""") +tf.app.flags.DEFINE_integer('emb_size', 64, + """Number of batches to run.""") +tf.app.flags.DEFINE_boolean('log_device_placement', False, + """Whether to log device placement.""") +tf.app.flags.DEFINE_integer('num_gpus', 4, + """How many GPUs to use.""") + +VOCAB_SIZE=30000 +NUM_CLASS=2 + + +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN=50000 +NUM_EPOCHS_PER_DECAY=50 +INITIAL_LEARNING_RATE = 0.1 +LEARNING_RATE_DECAY_FACTOR = 0.1 +TOWER_NAME = 'tower' + +train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) + +def get_incoming_shape(incoming): + """ Returns the incoming data shape """ + if isinstance(incoming, tf.Tensor): + return incoming.get_shape().as_list() + elif type(incoming) in [np.array, list, tuple]: + return np.shape(incoming) + else: + raise Exception("Invalid incoming layer.") + + +# Note input * W is done in LSTMCell, +# which is different from PaddlePaddle +def single_lstm(name, incoming, n_units, use_peepholes=True, + return_seq=False, return_state=False): + with tf.name_scope(name) as scope: + cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) + output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32) + out = output if return_seq else output[-1] + return (out, _cell_state) if return_state else out + + +def lstm(name, incoming, n_units, use_peepholes=True, + return_seq=False, return_state=False, num_layers=1): + with tf.name_scope(name) as scope: + lstm_cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) + cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers) + initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32) + if not isinstance(incoming, list): + # if the input is embeding, the Tensor shape : [None, time_step, emb_size] + incoming = [tf.squeeze(input_, [1]) + for input_ in tf.split(1, FLAGS.max_len, incoming)] + outputs, state = tf.nn.rnn(cell, incoming, initial_state=initial_state, 
+ dtype=tf.float32) + out = outputs if return_seq else outputs[-1] + return (out, _cell_state) if return_state else out + + +def embedding(name, incoming, vocab_size, emb_size): + with tf.name_scope(name) as scope: + #with tf.device("/cpu:0"): + embedding = tf.get_variable( + name+'_emb', [vocab_size, emb_size], dtype=tf.float32) + out = tf.nn.embedding_lookup(embedding, incoming) + return out + + +def fc(name, inpOp, nIn, nOut, act=True): + with tf.name_scope(name) as scope: + kernel = tf.get_variable(name + '_w', [nIn, nOut], + initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + biases = tf.get_variable(name + '_b', [nOut], + initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + dtype=tf.float32,trainable=True) + + net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ + tf.matmul(inpOp, kernel) + biases + + return net + + +def inference(seq): + net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size) + print "emb:", get_incoming_shape(net) + net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers) + print "lstm:", get_incoming_shape(net) + net = fc('fc1', net, FLAGS.hidden_size, 2) + return net + + +def loss(logits, labels): + # one label index for one sample + #labels = tf.cast(labels, tf.int64) + # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + # logits, labels, name='cross_entropy_per_example') + labels = tf.cast(labels, tf.float32) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits( + logits, labels, name='cross_entropy_per_example') + cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') + tf.add_to_collection('losses', cross_entropy_mean) + return tf.add_n(tf.get_collection('losses'), name='total_loss') + + +def tower_loss(scope): + """Calculate the total loss on a single tower running the model. + Args: + scope: unique prefix string identifying the tower, e.g. 
'tower_0' + Returns: + Tensor of shape [] containing the total loss for a batch of data + """ + data, label = train_dataset.next_batch(FLAGS.batch_size) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(data) + + # Build the portion of the Graph calculating the losses. Note that we will + # assemble the total_loss using a custom function below. + #_ = loss(last_layer, label) + _ = loss(last_layer, label) + + # Assemble all of the losses for the current tower only. + losses = tf.get_collection('losses', scope) + + # Calculate the total loss for the current tower. + total_loss = tf.add_n(losses, name='total_loss') + + # Compute the moving average of all individual losses and the total loss. + loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') + loss_averages_op = loss_averages.apply(losses + [total_loss]) + + # Attach a scalar summary to all individual losses and the total loss; do the + # same for the averaged version of the losses. + for l in losses + [total_loss]: + # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training + # session. This helps the clarity of presentation on tensorboard. + loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) + # Name each loss as '(raw)' and name the moving average version of the loss + # as the original loss name. + tf.scalar_summary(loss_name +' (raw)', l) + #tf.scalar_summary(loss_name, loss_averages.average(l)) + + with tf.control_dependencies([loss_averages_op]): + total_loss = tf.identity(total_loss) + return total_loss + + +def average_gradients(tower_grads): + """Calculate the average gradient for each shared variable across all towers. + Note that this function provides a synchronization point across all towers. + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over individual gradients. The inner list is over the gradient + calculation for each tower. 
+ Returns: + List of pairs of (gradient, variable) where the gradient has been averaged + across all towers. + """ + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(0, grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + +def time_tensorflow_run(session, target): + num_steps_burn_in = 80 + total_duration = 0.0 + total_duration_squared = 0.0 + for i in xrange(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _ = session.run(target, feed_dict={x_input:data, y_input:label}) + _, loss_value = session.run(target) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus + examples_per_sec = num_examples_per_step / duration + # sec_per_batch = duration / FLAGS.num_gpus + sec_per_batch = duration + + format_str = ('%s: step %d, loss= %.2f (%.1f examples/sec; %.3f ' + 'sec/batch batch_size= %d)') + print (format_str % + (datetime.now(), i - num_steps_burn_in, + loss_value, duration, sec_per_batch, num_examples_per_step)) + + total_duration += duration + total_duration_squared += duration * duration + + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' 
% + (datetime.now(), FLAGS.num_batches, mn, sd)) + +def run_benchmark(): + with tf.Graph().as_default(), tf.device('/cpu:0'): + # Create a variable to count the number of train() calls. This equals the + # number of batches processed * FLAGS.num_gpus. + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), trainable=False) + + # Calculate the learning rate schedule. + num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / + FLAGS.batch_size) + decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) + + # Create an optimizer that performs gradient descent. + opt = tf.train.AdamOptimizer(0.001) + + #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. + loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. 
allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + time_tensorflow_run(sess, [train_op, loss]) + + +def main(_): + run_benchmark() + + +if __name__ == '__main__': + tf.app.run() diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..bb4c69cb95f965eff35f1c5a60376bf1e84f841b --- /dev/null +++ b/benchmark/tensorflow/rnn/run.sh @@ -0,0 +1,29 @@ +set -e + +function test() { + lstm_num=$1 + batch_size=$2 + hid_size=$3 + prefix=$4 + python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \ + --hidden_size=${hid_size} \ + --forward_backward_only=1 \ + > logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1 +} + +if [ ! -d "logs" ]; then + mkdir logs +fi + +#--lstm_num--batch_size--hidden_size--# +test 2 64 256 +test 2 64 512 +test 2 64 1280 + +test 2 128 256 +test 2 128 512 +test 2 128 1280 + +test 2 256 256 +test 2 256 512 +test 2 256 1280 diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh new file mode 100755 index 0000000000000000000000000000000000000000..f7f52e01e38d304bb3bf8185c53bd0da26014d3a --- /dev/null +++ b/benchmark/tensorflow/rnn/run_multi.sh @@ -0,0 +1,28 @@ +set -e + +function test() { + num_gpu=$1 + lstm_num=$2 + hid_size=$3 + batch_per_gpu=`expr ${batch_size} / ${num_gpu}` + batch_size=$4 + python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \ + --num_gpus=${num_gpu} \ + --hidden_size=${hid_size} \ + --forward_backward_only=1 \ + > logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1 +} + +if [ ! 
-d "logs" ]; then + mkdir logs +fi + +#--num_gpus--lstm_num--hiddne_size--batch_size--# +test 4 2 256 128 +test 4 2 256 256 +test 4 2 256 512 + +test 4 2 512 128 +test 4 2 512 256 +test 4 2 512 512 +